In [68]:
import torch
import torch
import torch.nn as nn
import torch.optim as optim
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import wandb

In [58]:
X_temp = torch.load("hangman_ml_X.pt")

In [59]:
print(X_temp.shape)

torch.Size([3821871, 88])


In [60]:

def preprocess_input_vector(X):
    """
    Preprocess the input vector X of shape (num_samples, 88) by:
    - Converting -1 values (hidden tokens) to 37
    - Converting -2 values (padding tokens) to 38
    
    Args:
        X (torch.Tensor): Input tensor of shape (num_samples, 88)
        
    Returns:
        torch.Tensor: Processed tensor with same shape, with -1 and -2 replaced
    """
    # Create a copy to avoid modifying the original tensor
    X_processed = X.clone()
    
    # Only modify the first 50 positions which contain the word state
    word_state = X_processed[:, :50]
    
    # Replace -1 (hidden tokens) with 37
    word_state[word_state == -1] = 37
    
    # Replace -2 (padding tokens) with 38
    word_state[word_state == -2] = 38
    
    # The modified word state is already part of X_processed
    return X_processed

In [7]:
X_temp2 = preprocess_input_vector(X_temp)

In [13]:
torch.save(X_temp2, "X_train.pt")

In [69]:
"""
if X_train.pt file exists already then skip the above and run the code below
"""

'\nif X_train.pt file exists already then skip the above and run the code below\n'

In [70]:
X_train = torch.load("X_train.pt")
y_train = torch.load("hangman_ml_y.pt")

In [71]:
X_train.shape

torch.Size([3821871, 88])

In [72]:
y_train.shape

torch.Size([3821871])

In [73]:
import torch
import torch.nn as nn
from transformers import GPT2Model, GPT2Config

# Wide but shallow Transformer Policy Network for Hangman
class HangmanTransformerPolicy(nn.Module):
    def __init__(self, vocab_size=37, hidden_size=1024, num_layers=2, max_length=50):
        super(HangmanTransformerPolicy, self).__init__()
        self.max_length = max_length
        self.vocab_size = vocab_size
        
        # Define token values
        self.HIDDEN_TOKEN = 37
        self.PADDING_TOKEN = 38
        
        # Embedding for the word state (includes normal tokens 0-36, hidden token 37, and padding token 38)
        self.word_embedding = nn.Embedding(vocab_size + 2, hidden_size)
        
        # Transformer for processing the word state
        self.config = GPT2Config(
            vocab_size=vocab_size,
            n_positions=max_length,
            n_embd=hidden_size,
            n_layer=num_layers,  # Shallow (not deep)
            n_head=16,  # Wide attention heads
            n_inner=hidden_size*4  # Wide feed-forward layers
        )
        self.transformer = GPT2Model(self.config)
        
        # Linear projection for incorrectly guessed letters
        self.incorrect_projection = nn.Linear(37, hidden_size)
        
        # Embedding for lives (0-10)
        self.lives_embedding = nn.Embedding(11, hidden_size)
        
        # Final policy head (action logits)
        self.policy_head = nn.Linear(hidden_size * 3, vocab_size)
        
    def forward(self, x):
        batch_size = x.shape[0]
        
        # Split the input into its components
        word_state = x[:, :50].long()  # First 50 positions for the word
        incorrect_guesses = x[:, 50:87].float()  # Next 37 positions for incorrect guesses
        lives = x[:, 87].long()  # Last position for lives
        
        # Process word state
        word_embeds = self.word_embedding(word_state)
        
        # Create attention mask for padding (padding token is 38)
        attention_mask = (word_state != self.PADDING_TOKEN).float()
        
        # Pass through transformer
        transformer_outputs = self.transformer(
            inputs_embeds=word_embeds,
            attention_mask=attention_mask
        )
        hidden_states = transformer_outputs.last_hidden_state
        
        # Get global representation by mean pooling
        masked_hidden = hidden_states * attention_mask.unsqueeze(-1)
        sum_hidden = masked_hidden.sum(dim=1)
        word_features = sum_hidden / (attention_mask.sum(dim=1, keepdim=True) + 1e-10)
        
        # Process incorrectly guessed letters
        incorrect_features = self.incorrect_projection(incorrect_guesses)
        
        # Process lives
        lives_features = self.lives_embedding(lives)
        
        # Combine all features
        combined_features = torch.cat([word_features, incorrect_features, lives_features], dim=1)
        
        # Get action logits
        action_logits = self.policy_head(combined_features)
        
        # # Create masks for invalid actions
        # # 1. Determine correct guesses based on word state
        # # (if a character appears in word state and is not the hidden token, it's been guessed correctly)
        # correct_guesses = torch.zeros(batch_size, self.vocab_size, device=x.device)
        
        # for b in range(batch_size):
        #     for i in range(50):  # For each position in the word
        #         if 0 <= word_state[b, i] < self.vocab_size:  # If it's a revealed character (0-36)
        #             correct_guesses[b, word_state[b, i]] = 1
        
        # # 2. Combine with incorrect guesses to get all guessed characters
        # all_guessed = correct_guesses | incorrect_guesses.bool()
        
        # # 3. Mask out already guessed characters
        # action_logits = action_logits.masked_fill(all_guessed.bool(), -1e10)
        
        return action_logits

In [74]:
X_train.shape

torch.Size([3821871, 88])

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import wandb
import time
import torch_xla.core.xla_model as xm
from tqdm import tqdm

def train_model(model, X_tensor, y_tensor, epochs=50, batch_size=2048, learning_rate=0.0001):
    """
    Simple training loop for the Hangman Transformer model with wandb logging on TPU
    
    Args:
        model: The model instance to train
        X_train: Training features of shape (num_samples, 88)
        y_train: Training labels of shape (num_samples,)
        epochs: Number of training epochs
        batch_size: Batch size for training
        learning_rate: Learning rate for optimizer
    
    Returns:
        Trained model
    """
    # Initialize wandb
    wandb.init(project="hangman-transformer")
    
    # Get TPU device
    device = xm.xla_device()
    
    # Move model to TPU
    model = model.to(device)
    
    # Create optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    
    # Training loop
    num_samples = len(X_train)
    num_batches = (num_samples + batch_size - 1) // batch_size  # Ceiling division
    
    for epoch in tqdm(range(epochs)):
        epoch_start_time = time.time()
        model.train()
        running_loss = 0.0
        correct_predictions = 0
        
        # Process in batches
        for i in tqdm(range(num_batches)):
            # Get batch
            start_idx = i * batch_size
            end_idx = min((i + 1) * batch_size, num_samples)
            inputs = X_tensor[start_idx:end_idx]
            targets = y_tensor[start_idx:end_idx]
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(inputs)
            
            # Calculate loss
            loss = criterion(outputs, targets)
            
            # Backward pass
            loss.backward()
            
            # TPU-specific optimizer step
            xm.optimizer_step(optimizer)
            
            # Statistics
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            correct_predictions += (predicted == targets).sum().item()
        
        # Calculate epoch statistics
        epoch_loss = running_loss / num_batches
        epoch_accuracy = 100.0 * correct_predictions / num_samples
        epoch_time = time.time() - epoch_start_time
        
        # Log to wandb
        wandb.log({
            "epoch": epoch + 1,
            "loss": epoch_loss,
            "accuracy": epoch_accuracy,
            "time": epoch_time
        })
        
        # Print progress
        print(f'Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f}, '
              f'Accuracy: {epoch_accuracy:.2f}%, Time: {epoch_time:.2f}s')
    
    # Save model
    checkpoint_path = "hangman_transformer.pth"
    xm.save(model.state_dict(), checkpoint_path)
    wandb.save(checkpoint_path)
    
    # Finish wandb run
    wandb.finish()
    
    return model

# Example usage:
if __name__ == "__main__":
    # Create model
    model = HangmanTransformerPolicy()

    # Get TPU device
    device = xm.xla_device()

    X_train = X_train.to(device)
    y_train = y_train.to(device)
    
    # Train model
    trained_model = train_model(model, X_train, y_train)

  0%|                                                                                          | 0/50 [00:00<?, ?it/s]
  0%|                                                                                        | 0/1867 [00:00<?, ?it/s][A
  0%|                                                                              | 1/1867 [00:10<5:18:49, 10.25s/it][A
  0%|                                                                              | 2/1867 [00:32<9:02:33, 17.46s/it][A
  0%|                                                                             | 3/1867 [01:03<12:11:15, 23.54s/it][A
  0%|▏                                                                            | 4/1867 [01:47<16:21:30, 31.61s/it][A
  0%|▏                                                                            | 5/1867 [02:39<20:08:19, 38.94s/it][A
  0%|▏                                                                            | 6/1867 [03:41<24:14:00, 46.88s/it][A
  0%|▎                     

KeyboardInterrupt: 

In [75]:
# Get device
device = torch.device("xla")  # TPU device

In [76]:
device

device(type='xla')

In [31]:
device = xm.xla_device() if 'xla' in str(xm.xla_device()) else torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
device

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

# Assume we are running on TPU and the tensors are already on TPU.
is_tpu = True
device = xm.xla_device()

# Example dataset (X_train and y_train are already on the TPU)
dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset, batch_size=1024, shuffle=True)

# Wrap DataLoader with ParallelLoader so that it handles distribution across TPU cores
train_loader = pl.ParallelLoader(train_loader, [device]).per_device_loader(device)

# Create and move model to device (model parameters will be placed on TPU)
model = HangmanTransformerPolicy().to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 10
for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader):
        # Data is already on TPU; no additional .to(device) is needed.
        inputs, targets = batch

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        
        # For TPU, use the TPU-optimized step. Note: Do not call xm.mark_step() here,
        # because ParallelLoader already manages that internally.
        xm.optimizer_step(optimizer, barrier=True)
        
        total_loss += loss.item()
    
    # Reduce loss across TPU cores explicitly.
    total_loss = xm.mesh_reduce('loss_reduce', total_loss, sum)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")


  0%|                                                                                          | 0/10 [00:00<?, ?it/s]
  0%|                                                                                        | 0/3733 [00:00<?, ?it/s][A
  0%|                                                                                | 1/3733 [00:00<09:59,  6.23it/s][A
  0%|                                                                                | 2/3733 [00:00<10:05,  6.16it/s][A
  0%|                                                                                | 3/3733 [00:00<10:06,  6.15it/s][A
  0%|                                                                                | 4/3733 [00:00<10:06,  6.15it/s][A
  0%|                                                                                | 5/3733 [00:00<10:07,  6.14it/s][A
  0%|▏                                                                               | 6/3733 [00:00<10:07,  6.14it/s][A
  0%|▏                     

KeyboardInterrupt: 

In [11]:
X_train.shape

torch.Size([3821871, 88])

In [130]:
X_temp = X_train[:30,:]
y_temp = y_train[:30]

In [131]:
print(X_temp.shape, y_temp.shape)

torch.Size([30, 88]) torch.Size([30])


In [80]:
device = xm.xla_device()
X_temp = X_temp.to(device)
y_temp = y_temp.to(device)

In [81]:
y_preds = model(X_temp)

In [84]:
y_preds.shape

torch.Size([1, 37])

In [88]:
y_dis = torch.softmax(y_preds, dim=-1)

In [92]:
y_dis.shape

torch.Size([1, 37])

In [94]:
cat_pred = torch.multinomial(y_dis, num_samples=1).item()

In [95]:
cat_pred

15

In [90]:
torch.argmax(y_dis)

tensor(15, device='xla:0')

In [83]:
y_temp.shape

torch.Size([1])

In [55]:
loss = criterion(y_preds, y_temp)

In [56]:
loss.item()

0.13676178455352783

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import wandb

# Assume we are running on TPU and the tensors are already on TPU.
is_tpu = True
device = xm.xla_device()

# Example dataset (X_train and y_train are assumed to be on the TPU)
dataset = TensorDataset(X_temp, y_temp)
train_loader_base = DataLoader(dataset, batch_size=1024, shuffle=True)

# Wrap DataLoader with ParallelLoader so that it handles distribution across TPU cores.
# train_loader = pl.ParallelLoader(train_loader, [device]).per_device_loader(device)

# Create and move model to device (model parameters will be placed on TPU).
model = HangmanTransformerPolicy().to(device)

# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 100
best_avg_loss = float('inf')


for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0.0
    num_batches = 0
    train_loader = pl.ParallelLoader(train_loader_base, [device]).per_device_loader(device)
    for batch in tqdm(train_loader):
        # Data is already on TPU; no additional .to(device) is needed.
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        xm.master_print(f'loss is {loss.item()}')

        # Use TPU-optimized optimizer step. (No manual xm.mark_step() needed.)
        xm.optimizer_step(optimizer, barrier=True)

        total_loss += loss.item()
        num_batches += 1

    # Reduce loss across TPU cores explicitly.
    xm.master_print(f"number of batches is {num_batches}")
    total_loss = xm.mesh_reduce('loss_reduce', total_loss, sum)
    avg_loss = total_loss / len(train_loader)
    xm.master_print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}")


# checkpoint_path = f"hangman_transformer.pth"
# xm.save(model.state_dict(), checkpoint_path)
        


  0%|                                                                                         | 0/100 [00:00<?, ?it/s]
  0%|                                                                                           | 0/1 [00:00<?, ?it/s][A
  0%|                                                                                         | 0/100 [00:00<?, ?it/s]


RuntimeError: torch_xla/csrc/tensor.cpp:243 : Check failed: data()->tensor_data 
*** Begin stack trace ***
	tsl::CurrentStackTrace()
	torch_xla::XLATensor::GetXlaData()
	torch_xla::XLATensor::ToTensor(bool)
	torch_xla::XLANativeFunctions::_to_copy(at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>)
	
	at::_ops::_to_copy::redispatch(c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>)
	
	at::_ops::_to_copy::call(at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>)
	
	at::_ops::_to_copy::redispatch(c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>)
	
	
	at::_ops::_to_copy::call(at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>)
	at::native::to(at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, bool, std::optional<c10::MemoryFormat>)
	
	at::_ops::to_dtype_layout::call(at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, bool, std::optional<c10::MemoryFormat>)
	
	
	
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	
	
	
	
	
*** End stack trace ***


In [30]:
def _train_fn(para_loader, optimizer, model, criterion, device):
     for batch in para_loader:
        # Data is already on TPU; no additional .to(device) is needed.
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        # xm.master_print(f'loss is {loss.item()}')

        # Use TPU-optimized optimizer step. (No manual xm.mark_step() needed.)
        xm.optimizer_step(optimizer, barrier=True)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import wandb

# Assume we are running on TPU and the tensors are already on TPU.
is_tpu = True
device = xm.xla_device()

# Example dataset (X_train and y_train are assumed to be on the TPU)
dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset, batch_size=1024, shuffle=True)

# Wrap DataLoader with ParallelLoader so that it handles distribution across TPU cores.
# train_loader = pl.ParallelLoader(train_loader, [device]).per_device_loader(device)

# Create and move model to device (model parameters will be placed on TPU).
model = HangmanTransformerPolicy().to(device)

# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 100


for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0.0
    num_batches = 0
    # train_loader = pl.ParallelLoader(train_loader, [device]).per_device_loader(device)
    _train_fn(train_loader, optimizer, model, criterion, device)

    # # Reduce loss across TPU cores explicitly.
    # xm.master_print(f"number of batches is {num_batches}")
    # total_loss = xm.mesh_reduce('loss_reduce', total_loss, sum)
    # avg_loss = total_loss / len(train_loader)
    # xm.master_print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}")


checkpoint_path = f"hangman_transformer.pth"
xm.save(model.state_dict(), checkpoint_path)
        


  1%|▊                                                                            | 1/100 [07:47<12:50:36, 467.04s/it]

In [77]:
import random
import numpy as np
from collections import Counter

# Load a word list
def load_words(filename="words.txt", min_length=1):
    with open(filename) as f:
        words = [line.strip().lower() for line in f if len(line.strip()) >= min_length]
    return words

# Hangman Game Logic
class Hangman:
    def __init__(self, word):
        self.word = word
        self.guessed = set()
        self.incorrect_guesses = set()
        self.state = ["_" if c.isalpha() else c for c in word]

    def guess(self, letter):
        print(f"Guessing letter: {letter}")
        self.guessed.add(letter)
        if letter in self.word:
            print(f"{letter} is in the word!")
            for i, c in enumerate(self.word):
                if c == letter:
                    self.state[i] = letter
        else:
            print(f"{letter} is NOT in the word.")
            self.incorrect_guesses.add(letter)

    def get_pattern(self):
        return "".join(self.state)

    def is_solved(self):
        return "_" not in self.state

    def allowed_guesses(self):
        return set("abcdefghijklmnopqrstuvwxyz") - self.guessed

In [165]:
# Define vocabulary and create vocabulary index mapping
VOCABULARY = ['!', '&', "'", '-', '.', '/', '0', '1', '2', '3', '5',
              'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
              'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
              'w', 'x', 'y', 'z']
VOCAB_SIZE = len(VOCABULARY)
VOCAB_TO_IDX = {char: idx for idx, char in enumerate(VOCABULARY)}
IDX_TO_VOCAB = {idx: char for idx, char in enumerate(VOCABULARY)}

# Special token values
UNKNOWN_TOKEN = 37  # For unknown letters in the word ('_')
PADDING_TOKEN = 38  # For padding positions beyond the word length

class ModelMove:
    def __init__(self, model):
        self.model = model

    def encode_state(self, pattern, incorrect_guesses, lives_left):
        # Encode pattern as a fixed-length vector (padded to length 50)
        # Each position contains:
        # - vocab index for revealed letters
        # - UNKNOWN_TOKEN (-1) for unknown letters ('_')
        # - PADDING_TOKEN (-2) for padding beyond word length
        pattern_vector = []
        for char in pattern:
            if char == '_':
                pattern_vector.append(UNKNOWN_TOKEN)  # Use 37 for unknown letters
            else:
                pattern_vector.append(VOCAB_TO_IDX.get(char, UNKNOWN_TOKEN))  # Get vocab index

        # Pad to length 50 with PADDING_TOKEN
        pattern_vector += [PADDING_TOKEN] * (50 - len(pattern_vector))

        # Encode incorrect guesses as a binary vector
        incorrect_guesses_vector = [0] * VOCAB_SIZE
        for letter in incorrect_guesses:
            if letter in VOCAB_TO_IDX:
                incorrect_guesses_vector[VOCAB_TO_IDX[letter]] = 1

        features = pattern_vector + incorrect_guesses_vector + [lives_left]
        state_vector = torch.tensor(features, dtype=torch.float32)
        state_vector = state_vector.unsqueeze(0)
        # print(f'state_vector.shape={state_vector.shape}')
        # print(f'state_vector={state_vector}')
        self.state_vector = state_vector
        return state_vector


    def make_move(self, pattern, incorrect_guesses, lives_left):
        state_vector = self.encode_state(pattern, incorrect_guesses, lives_left)
        pred_logits = self.model(state_vector)
        pred_dist = torch.softmax(pred_logits, dim=-1)

        # Sample from distribution
        guess_index = torch.argmax(pred_dist).item()
        # guess_index = torch.multinomial(pred_dist, num_samples=1).item()

        # Get corresponding letter
        guess = IDX_TO_VOCAB[guess_index]

        return guess

In [166]:
ModelPlayer = ModelMove(model)

In [176]:
# Load a word list and split into training and validation sets
def load_words(filename="words.txt", min_length=5, split_ratio=0.9):
    with open(filename) as f:
        words = [line.strip().lower() for line in f if len(line.strip()) >= min_length]

    random.shuffle(words)
    split_idx = int(len(words) * split_ratio)
    training_set = words[:split_idx]
    validation_set = words[split_idx:]

    return training_set, validation_set

In [177]:
train_words, val_words = load_words()

In [208]:
train_sample = random.sample(train_words, k= 10)
val_sample = random.sample(val_words, k=1000)
print(f'\ntrain_sample={train_sample}\n')
print(f'\nval_sample={val_sample}\n')


train_sample=['tiber', 'edifications', 'amissible', 'finical', 'high-lineaged', 'spock', 'unbarricading', 'disparkle', 'semiflex', 'antiliberals']


val_sample=['linesman', 'chaferies', 'variating', 'sonneteeress', 'shantow', 'airburst', 'ungilded', 'photostereograph', 'storified', 'coffeepots', 'tear-', 'subvestment', 'nose-smart', 'kinderhook', 'five-flowered', 'narwhal', 'mouldies', 'appertise', 'subdemonstrate', 'esteemable', 'vondsira', 'forelooper', 'desde', 'undesign', 'preobviousness', 'racoons', 'mainstays', 'sparse', 'kneebrush', 'warranty', 'lateriflexion', 'twice-folded', 'out-of-course', 'boonie', 'snowcraft', 'phyllophaga', 'oceanfront', 'unclassifiable', 'argus', 'intumesce', 'neurohormone', 'omentitis', 'erical', 'beefwood', 'employer-owned', 'heathered', 'unvariedly', 'nasalizing', 'kendna', 'dramatics', 'nongraphicalness', 'chloroaurite', 'theezan', 'shradd', 'nonsaponifiable', 'puritandom', 'kecksies', 'fussle', 'bhaga', 'involuted', 'precedencies', 'accentuator', '

In [186]:
def contains_alpha(s):
    return any(c.isalpha() for c in s)

In [184]:
Player = Hangman(word)

In [216]:
model.eval()
state_vectors = []
num_games_played = 0
won = 0
for word in val_sample:
    Player = Hangman(word)
    lives_left = 6
    while (not Player.is_solved()) and (lives_left > 0):
        if (not contains_alpha(Player.state)) and Player.incorrect_guesses == set():
            print(f"Playing with word: {Player.word}\n")
        pattern = Player.state
        incorrect_guesses = Player.incorrect_guesses
        guess = ModelPlayer.make_move(pattern, incorrect_guesses, lives_left)
        if (guess not in Player.word) or guess in Player.state or guess in Player.incorrect_guesses:
            lives_left -= 1
        state_vectors.append(ModelPlayer.state_vector)
        Player.guess(guess)
        # print(f'number of lives is {lives_left}')
    
    if Player.is_solved():
        print("\nSolved!\n")
        won += 1
    else:
        print("\nFailed!\n")
    num_games_played += 1

print(F"GAMES ARE OVER! YOU SCORED {won / num_games_played * 100}%!")



Playing with word: linesman

Guessing letter: e
e is in the word!
Guessing letter: a
a is in the word!
Guessing letter: n
n is in the word!
Guessing letter: i
i is in the word!
Guessing letter: m
m is in the word!
Guessing letter: r
r is NOT in the word.
Guessing letter: h
h is NOT in the word.
Guessing letter: f
f is NOT in the word.
Guessing letter: c
c is NOT in the word.
Guessing letter: p
p is NOT in the word.
Guessing letter: g
g is NOT in the word.

Failed!

Playing with word: chaferies

Guessing letter: e
e is in the word!
Guessing letter: r
r is in the word!
Guessing letter: s
s is in the word!
Guessing letter: i
i is in the word!
Guessing letter: o
o is NOT in the word.
Guessing letter: a
a is in the word!
Guessing letter: f
f is in the word!
Guessing letter: c
c is in the word!
Guessing letter: h
h is in the word!

Solved!

Playing with word: variating

Guessing letter: e
e is NOT in the word.
Guessing letter: i
i is in the word!
Guessing letter: n
n is in the word!
Guessing

In [137]:
X_temp[10]

tensor([29., 13., 31., 28., 37., 19., 15., 29., 30., 38., 38., 38., 38., 38.,
        38., 38., 38., 38., 38., 38., 38., 38., 38., 38., 38., 38., 38., 38.,
        38., 38., 38., 38., 38., 38., 38., 38., 38., 38., 38., 38., 38., 38.,
        38., 38., 38., 38., 38., 38., 38., 38.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  3.])

In [157]:
state_vectors[0].shape

torch.Size([1, 88])

In [158]:
y_pred = model(state_vectors[0])

In [159]:
torch.argmax(y_pred)

tensor(15)

In [160]:
y_temp[0]

tensor(15)

In [161]:
IDX_TO_VOCAB[15]

'e'

In [144]:
word = ''
for i in range(9):
    letter = IDX_TO_VOCAB[X_temp[10][i].item()] if i != 4 else 'v'
    word += letter

print(word)

scurviest


In [141]:
IDX_TO_VOCAB

{0: '!',
 1: '&',
 2: "'",
 3: '-',
 4: '.',
 5: '/',
 6: '0',
 7: '1',
 8: '2',
 9: '3',
 10: '5',
 11: 'a',
 12: 'b',
 13: 'c',
 14: 'd',
 15: 'e',
 16: 'f',
 17: 'g',
 18: 'h',
 19: 'i',
 20: 'j',
 21: 'k',
 22: 'l',
 23: 'm',
 24: 'n',
 25: 'o',
 26: 'p',
 27: 'q',
 28: 'r',
 29: 's',
 30: 't',
 31: 'u',
 32: 'v',
 33: 'w',
 34: 'x',
 35: 'y',
 36: 'z'}