In [46]:
import random
import numpy as np
from itertools import permutations
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [17]:
# Every ordered arrangement of 4 distinct digits taken from 1..8 is one
# candidate code; the position of a code in this list is its action index.
all_posibilities = [code for code in permutations(range(1, 9), 4)]
action_space_size = len(all_posibilities)
print(f"action_space_size is: {action_space_size}")

action_space_size is: 1680


In [35]:
#Getting Feedback
def get_feedback(guess, password):
    """Score a guess against the secret code.

    Args:
        guess: sequence supporting ``.count`` (e.g. a tuple of digits).
        password: sequence supporting ``.count``, the secret code.

    Returns:
        ``(green, yellow)`` where ``green`` counts positions at which both
        sequences hold the same value, and ``yellow`` counts values present
        in both sequences (respecting multiplicity) that are not already
        counted as green.
    """
    # Exact positional matches.
    exact_hits = 0
    for guessed_digit, secret_digit in zip(guess, password):
        if guessed_digit == secret_digit:
            exact_hits += 1

    # Values shared by both sequences regardless of position.
    shared = 0
    for digit in set(guess):
        shared += min(guess.count(digit), password.count(digit))

    # Shared values that are not exact hits are in the wrong position.
    return exact_hits, shared - exact_hits

In [45]:
#Reward Function
def reward_function(feedback, green_weight=10, yellow_weight=2, step_penalty=5):
    """Turn a ``(green, yellow)`` feedback pair into a scalar reward.

    The reward is a weighted sum of the two feedback counts minus a constant
    per-guess penalty. The defaults reproduce the original hard-coded shaping
    (``green * 10 + yellow * 2 - 5``), so existing one-argument calls are
    unaffected.

    Args:
        feedback: two-element iterable ``(green, yellow)``.
        green_weight: reward per green (exact-position) match.
        yellow_weight: reward per yellow (wrong-position) match.
        step_penalty: constant subtracted from every reward.

    Returns:
        The shaped reward for this guess.
    """
    green, yellow = feedback
    return green * green_weight + yellow * yellow_weight - step_penalty

In [42]:
#State Encoding
def get_state(guess, green, yellow):
    """Encode a guess and its feedback as a flat state vector.

    Args:
        guess: a code that must be an element of ``all_posibilities``
            (``list.index`` raises ``ValueError`` otherwise).
        green: green count from the feedback.
        yellow: yellow count from the feedback.

    Returns:
        1-D array of length ``action_space_size + 2``: a one-hot encoding of
        the guess followed by ``green / 4`` and ``yellow / 4``.
    """
    # Allocate the whole vector at once: one-hot part first, feedback last.
    encoded = np.zeros(action_space_size + 2)
    encoded[all_posibilities.index(guess)] = 1
    # Dividing by 4.0 normalises the two feedback counts.
    encoded[-2:] = np.array([green, yellow]) / 4.0
    return encoded

In [27]:
# Q-Network Construction
class QNetwork(nn.Module):
    """Two-layer fully connected network mapping a state to per-action values.

    Architecture: ``Linear(input_dim, 512) -> ReLU -> Linear(512, output_dim)``.
    """

    def __init__(self, input_dim, output_dim):
        """Build the layers.

        Args:
            input_dim: size of the input feature vector.
            output_dim: number of output values (one per action).
        """
        super().__init__()
        hidden_layer = nn.Linear(input_dim, 512)
        output_layer = nn.Linear(512, output_dim)
        # Kept as a single Sequential under ``self.model`` so parameter names
        # in the state dict stay ``model.0.*`` and ``model.2.*``.
        self.model = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

    def forward(self, x):
        """Return the network output for input tensor ``x``."""
        q_values = self.model(x)
        return q_values

In [31]:
# --- Optimiser ---
learning_rate = 1e-3      # step size handed to the Adam optimiser

# --- Q-learning ---
gamma = 0.95              # discount factor applied to the bootstrapped next-state value
alpha = 0.1               # NOTE(review): not referenced by the cells visible here - confirm it is still needed

# --- Epsilon-greedy exploration ---
epsilon = 1.0             # initial probability of picking a random action
epsilon_min = 0.05        # floor below which epsilon is not decayed
epsilon_decay = 0.999     # multiplicative decay applied to epsilon after each episode

# --- Training length ---
epochs = 3000             # number of training episodes

In [32]:
#initialization
# The state is a one-hot vector over all actions plus two feedback values,
# and the network emits one value per action.
state_dim = action_space_size + 2
q_net = QNetwork(input_dim=state_dim, output_dim=action_space_size)

# Mean-squared error between predicted and target Q-value.
loss_function = nn.MSELoss()
optimizer = optim.Adam(params=q_net.parameters(), lr=learning_rate)

In [44]:
#Training Process
# Online Q-learning: one gradient step is taken after every guess, using the
# same network for the prediction and for the bootstrapped target.
all_rewards = []   # total reward collected in each episode
all_steps = []     # number of guesses taken in each episode

for each_round in range(epochs):
    password = random.choice(all_posibilities)     # secret code for this episode
    done = False                                   # set to True once the code is guessed
    total_reward = 0
    steps = 0
    history = []                                   # (guess, feedback) pairs of this episode

    # Before the first guess there is nothing to encode, so the state is all
    # zeros. Afterwards the state is the encoding of the latest guess and its
    # feedback, which is exactly `next_state` from the previous step, so it is
    # carried forward instead of being re-encoded from `history`.
    state = torch.zeros(state_dim)

    while not done and steps < 30:                 # an episode is cut off after 30 guesses
        #e-greedy strategy
        if random.random() < epsilon:
            action_index = random.randint(0, action_space_size - 1)  # exploration
        else:
            with torch.no_grad():
                q_values = q_net(state.unsqueeze(0))
                action_index = torch.argmax(q_values).item()         # exploitation

        guess = all_posibilities[action_index]
        feedback = get_feedback(guess, password)

        # Four greens means the code is solved: fixed bonus and end of episode.
        if feedback[0] == 4:
            reward = 100
            done = True
        else:
            reward = reward_function(feedback)
        total_reward += reward
        steps += 1

        # Encode the guess just made together with its feedback.
        next_state = torch.tensor(get_state(guess, *feedback), dtype = torch.float32)

        # Bellman target. A solved episode has no future return, so the target
        # on the terminal step is the reward alone; bootstrapping from
        # max Q(next_state) there would add a spurious value to the winning
        # action. Hitting the 30-guess cap is a truncation, not a terminal
        # state, so that case still bootstraps.
        with torch.no_grad():
            if done:
                target_q = torch.tensor(float(reward))
            else:
                target_q = reward + gamma * torch.max(q_net(next_state.unsqueeze(0)))
        predicted_q = q_net(state.unsqueeze(0))[0, action_index]
        loss = loss_function(predicted_q, target_q)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        #update history and move to the next state
        history.append((guess, feedback))
        state = next_state

    #rewards/steps storage
    all_rewards.append(total_reward)
    all_steps.append(steps)
    #update epsilon (decays once per episode, never below epsilon_min)
    epsilon = max(epsilon * epsilon_decay, epsilon_min)

    if (each_round + 1) % 100 == 0:
        print(f"Episode {each_round + 1}, Steps: {steps}, Reward: {total_reward}, Epsilon: {epsilon:.3f}")
            

NameError: name 'episode' is not defined