In [27]:
# Standard
from collections import namedtuple, deque
import random

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import trange
import gc
# knister api
from api import KnisterGame

In [28]:
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print(device)

cpu


## Q network

In [29]:
class QNet(nn.Module):
    # Policy Network
    def __init__(self, n_state_vars, n_actions, dim_hidden=256):
        super(QNet, self).__init__()

        # Define a feedforward neural network with hidden layers, ReLU
        #  activations, and an output layer that maps to the number of actionsh
        # creo una rete ad imbuto in modo che il modello posso filtrare le azioni meno rilevanti per la stima dei Q-values
        self.fc = nn.Sequential(
            nn.Linear(n_state_vars, dim_hidden),
            nn.ReLU(),
            nn.Linear(dim_hidden, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, n_actions)
        )

    def forward(self, x):
        # Passes the input through the network layers to output Q-values
        return self.fc(x)

##  Replay Buffer

In [30]:
class ReplayBuffer:
    def __init__(self, n_actions, memory_size, batch_size):
        # Initialize actions, batch and experience template
        self.n_actions = n_actions
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # Initialize the memory
        self.memory = deque(maxlen=memory_size)  # Hint: use deque

    def __len__(self):
        return len(self.memory)

    def add(self, state, action, reward, next_state, done):
        # Store experience in memory
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        # Sample a batch of experiences
        experiences = random.sample(self.memory, k=self.batch_size)  # Hint: use random

        # Convert to tensors for training
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences if e is not None])
        ).float().to(device)

        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences if e is not None])
        ).long().to(device)

        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences if e is not None])
        ).float().to(device)

        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences if e is not None])
        ).float().to(device)

        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)
        ).float().to(device)

        # Return the tuple with all tensors
        return (states,actions,rewards,next_states,dones)

## DQN

In [31]:
class DQN:
    def __init__(
        self, n_states, n_actions, batch_size=64, learning_rate=1e-4,
        learn_step=5, gamma=0.99, mem_size=int(1e5), tau=1e-3
    ):
        # Core parameters for learning and updating the Q-network
        self.n_states = n_states
        self.n_actions = n_actions
        self.batch_size = batch_size
        self.gamma = gamma  # Discount factor for future rewards
        self.learn_step = learn_step  # Frequency of learning steps
        self.tau = tau  # Rate for soft updating the target network

        # Initialize the policy network (net_eval) and target network (net_target)
        self.net_eval = QNet(n_states,n_actions).to(device)  # Hint: we have a class for this
        self.net_target = QNet(n_states,n_actions).to(device)  # Hint: we have a class for this
        self.optimizer = optim.Adam(self.net_eval.parameters(),lr=learning_rate)  # Suggestion: use Adam from `optim` with specific learning rate
        self.criterion =  nn.MSELoss() # Suggestion: use Mean Squared Error (MSE) as the loss function

        # Initialize memory for experience replay
        self.memory = ReplayBuffer(n_actions,mem_size,batch_size)  # Hint: we have a class for this
        self.counter = 0  # Tracks learning steps for periodic updates

    def getAction(self, state,available_actions, epsilon):
        # Select action using an epsilon-greedy strategy to balance exploration
        #  and exploitation
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.net_eval.eval()  # Set network to evaluation mode
        with torch.no_grad():
            action_values = self.net_eval(state)
        self.net_eval.train()  # Return to training mode

        # Choose random action with probability epsilon, otherwise choose best
        #  predicted action
        if random.random() < epsilon:
            #action = random.choice(np.arange(self.n_actions))
            action = random.choice(available_actions)
        else:
            # Calcolo dei q_values
            q_values = action_values.cpu().data.numpy().squeeze()
            # Creiamo una maschera con -infinito
            mask = np.full(q_values.shape, -1e5)
            # Copiamo solo i valori delle azioni VALIDE nella maschera
            mask[available_actions] = q_values[available_actions]
            # Argmax sulla maschera sceglierà sempre un'azione valida
            action = np.argmax(mask)
            #action =np.argmax(action_values.cpu().data.numpy())   # Hint: you may find `np.argmax` useful
        
        return action

    def save2Memory(self, state, action, reward, next_state, done):
        # Save experience to memory and, if ready, sample from memory and
        #  update the network
        self.memory.add(state, action, reward, next_state, done)
        self.counter += 1  # Increment step counter

        # Perform learning every 'learn_step' steps if enough experiences are
        #  in memory
        if (self.counter % self.learn_step == 0 and len(self.memory) >= self.batch_size):  # Hint: check if counter is a multiple of learn_step and  memory has enough samples
            experiences= self.memory.sample() # Hint: retrieve a sample of experiences
            self.learn(experiences)

    def learn(self, experiences):
        # Perform a learning step by minimizing the difference between
        #  predicted and target Q-values
        states, actions, rewards, next_states, dones = experiences
        
        ## AGGIUNTA DELLA MASCHERA PER LE AZIONI NON VALIDE ##
        q_target_predictions = self.net_target(next_states).detach()
        # recupero le griglie degli stati successivi
        next_states_grids = next_states[:, :25]
        # Creo la maschera per le azioni valide
        invalid_mask = (next_states_grids != 0)
        # Imposto a -inf i Q-values delle azioni non valide
        q_target_predictions[invalid_mask] = -1e5

        # Compute target Q-values from net_target for stability in training
        q_target = q_target_predictions.max(1)[0].unsqueeze(1)
        """# Compute target Q-values from net_target for stability in training
        q_target = self.net_target(next_states).detach().max(1)[0].unsqueeze(1)"""
        y_j = rewards + (self.gamma * q_target * (1 - dones))
            # Bellman equation for target Q-value
        q_eval = self.net_eval(states).gather(1, actions)
            # Q-value predictions from policy network

        # Compute loss and backpropagate to update net_eval
        loss = self.criterion(q_eval, y_j)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network with soft update for smooth learning
        self.targetUpdate()

    def targetUpdate(self):
        # Soft update to gradually shift target network parameters toward
        #  policy network parameters
        params = zip(self.net_eval.parameters(), self.net_target.parameters())
        for eval_param, target_param in params:
            target_param.data.copy_(
                self.tau  * eval_param.data + (1.0-self.tau) * target_param.data # Hint: use the `self.tau` rate for soft updating
            )

### Funzione di calcolo dello stato dell'ambiente e one hot encoding del dado

In [32]:
def get_obs_state(game):
    """
    Converte lo stato:
    - Griglia: 25 valori normalizzati (0-1)
    - Dado: One-Hot Encoding (vettore di 11 zeri con un 1)
    """
    # 1. Griglia Normalizzata (25 valori)
    grid = game.get_grid().flatten() / 12.0
    
    # 2. Dado One-Hot Encoding (11 valori)
    # I dadi vanno da 2 a 12 (11 possibili risultati)
    dice_val = int(game.get_current_roll())
    
    # Creo un vettore di 11 zeri
    dice_one_hot = np.zeros(11, dtype=np.float32)
    
    # Metto '1' nella posizione giusta
    # Se esce 2 -> indice 0. Se esce 12 -> indice 10.
    if 2 <= dice_val <= 12:
        dice_one_hot[dice_val - 2] = 1.0
        
    # Unisco tutto: 25 (griglia) + 11 (dado) = 36 valori totali
    obs = np.concatenate([grid, dice_one_hot])
    
    return obs.astype(np.float32)

In [33]:
CHECKPOINT_NAME = 'checkpoint.pth'
RECENT_EPISODES = 100  # Number of episodes for average score in early stopping
MIN_EPISODES_FOR_STOP = 100  # Ensures enough episodes before evaluating target

def train(
        env, agent, n_episodes, max_steps,
        eps_start, eps_end, eps_decay,
        target_score, do_store_checkpoint
):
    # Initialize score history and epsilon (exploration rate)
    game_score_hist = []
    reward_hist = []
    epsilon_hist = []
    epsilon = eps_start

    # Progress bar format for tracking training progress
    bar_format = '{l_bar}{bar:10}| {n:4}/{total_fmt}'\
                 ' [{elapsed:>7}<{remaining:>7}, {rate_fmt}{postfix}]'
    pbar = trange(n_episodes, unit="ep", bar_format=bar_format, ascii=True)

    for idx_epi in pbar:
        # Reset the environment for a new episode
        env.new_game()
        state = get_obs_state(env)
        score = 0.0

        for idx_step in range(max_steps):
            # Select an action based on the current policy (epsilon-greedy)
            action = agent.getAction(state,env.get_available_actions(), epsilon)  # Hint: we have something ready for this

            #reward_shaping = calculate_potential(env.get_grid())
            
            # Execute the chosen action in the environment            
            env.choose_action(action)

            #cambio di stato
            next_state = get_obs_state(env)
            
            #creo la reward totale sommando il reward shaping
            reward = env.get_last_reward() 
            
            # Check if the episode is finished
            done = env.has_finished()
            
            # Store experience in memory and update the agent
            agent.save2Memory(state,action,reward,next_state,done)  # Hint: what data do we store as "experience"? How we do it?
            state = next_state  # Move to the next state
            
            score += reward  # Hint: what's our total cumulative score?
            # Check if the episode is finished
            if done:
                break
        
        # Track scores and decay epsilon for less exploration over time
        #salva lo score di ogni episodio per vederlo nel grafico
        game_score = env.get_total_reward()
        game_score_hist.append(game_score)
        epsilon_hist.append(epsilon)
        score_avg = np.mean(game_score_hist[-RECENT_EPISODES:])
        epsilon =max(epsilon - eps_decay,eps_end)  # Hint: epsilon decreases, but we have a minimum 
        """"per la epsiolon"""

        # Update the progress bar with the current score and average
        pbar.set_postfix_str(
            f"Score: {score: 7.2f}, 100 score avg: {score_avg: 7.2f}, Eps: {epsilon: .4f}"
        )
        pbar.update(0)

        # Clear GPU memory periodically to prevent memory issues
        if idx_epi % 1000 == 0:
            torch.cuda.empty_cache()
            gc.collect()
            
        # Early stopping condition if target score is achieved
        if len(game_score_hist) >= 100 and score_avg >= target_score:
            print("\nTarget Reached!")
            break

    # Print completion message based on early stopping or max episodes
    if (idx_epi + 1) < n_episodes:
        print("\nTraining complete - target reached!")
    else:
        print("\nTraining complete - maximum episodes reached.")

    # Save the trained model if specified
    if do_store_checkpoint:
        torch.save(agent.net_eval.state_dict(), CHECKPOINT_NAME)

    return game_score_hist, epsilon_hist

In [34]:
def plotReward(rewards):
    # Plot the agent's reward history to visualize learning progress
    plt.figure()
    plt.plot(rewards)
    plt.title("Reward History")
    plt.xlabel("Episodes")
    plt.ylabel("Reward")
    plt.show()

In [35]:
def plotScore(scores):
    # Plot the agent's score history to visualize learning progress
    plt.figure()
    plt.plot(scores)
    plt.title("Score History")
    plt.xlabel("Episodes")
    plt.ylabel("Score")
    plt.show()

In [36]:
def plotEpsilon(epsilons):
    # Plot the agent's epsilon history to visualize exploration rate changes
    plt.figure()
    plt.plot(epsilons)
    plt.title("Epsilon History")
    plt.xlabel("Episodes")
    plt.ylabel("Epsilon")
    plt.show()

In [37]:
def plotAll(scores, epsilons):
    # Plot reward, score, and epsilon histories in a single figure with subplots
    plt.figure(figsize=(12, 8))

    plt.subplot(2, 1, 1)
    plt.plot(scores)
    plt.title("Score History")
    plt.xlabel("Episodes")
    plt.ylabel("Score")

    plt.subplot(2, 1, 2)
    plt.plot(epsilons)
    plt.title("Epsilon History")
    plt.xlabel("Episodes")
    plt.ylabel("Epsilon")
    
    plt.tight_layout()
    plt.show()

## **Section 4 - Time to learn (training)**

In [38]:
BATCH_SIZE = 64         # Number of experiences sampled per learning step
LR = 1e-3                # Learning rate for optimizer
EPISODES = 200000        # Maximum number of episodes to train (~1 or 2 minute for 100 episodes)
TARGET_SCORE = 100        # Early stop if average score reaches this value
GAMMA = 0.99             # Discount factor for future rewards
MEMORY_SIZE = 100000      # Maximum capacity of replay memory
LEARN_STEP = 10          # Frequency (in steps) of learning updates
TAU = 1e-3               # Soft update rate for the target network
SAVE_CHKPT = True       # Option to save trained model checkpoint
#Exploration parameters
MAX_STEPS = 25           # Maximum steps per episode
EPS_START = 1             # Initial epsilon for exploration (100% exploration at start)
EPS_END = 0.05            # Minimum epsilon (final exploration rate)
# Epsilon decay rate (controls exploration reduction)
EPS_DECAY = (EPS_START - EPS_END) / 50000 # così dopo circa 120000 episodi arriva a EPS_END

In [None]:
# 1. Crea l'ambiente
env = KnisterGame()

# 2. Definisci dimensioni stato e azione
# Input: 25 caselle + 11 valori dado (one-hot) = 36
n_states = 25+11 
# Output: 25 possibili posizioni dove scrivere
n_actions = 25 

# 3. Inizializza l'Agente
agent = DQN(
    n_states=n_states,
    n_actions=n_actions,
    batch_size=BATCH_SIZE,
    learning_rate=LR, 
    mem_size=MEMORY_SIZE,
    gamma=GAMMA,
    learn_step=LEARN_STEP,
    tau=TAU
)

# 4. Avvia il Training (passando il decadimento lineare corretto)
game_score_hist, reward_hist,epsilon_hist = train(env, agent, n_episodes=EPISODES, max_steps=MAX_STEPS,
                   eps_start=EPS_START, eps_end=EPS_END, eps_decay=EPS_DECAY,
                   target_score=TARGET_SCORE, do_store_checkpoint=SAVE_CHKPT)
# 5. Plotta i risultati
plotAll(game_score_hist, epsilon_hist)
# Free up GPU memory if using CUDA
if str(device) == "cuda":
    torch.cuda.empty_cache()

100%|##########| 200000/200000 [1:14:54<  00:00, 44.50ep/s, Score:   44.00, 100 score avg:   37.06, Eps:  0.0500]


Training complete - maximum episodes reached.





ValueError: not enough values to unpack (expected 3, got 2)

TRAINING

Test veloce per vedere quanto fa di media il gioco

In [40]:
def test_knister_agent_fast(env, agent,num_tests, max_steps):
    print("--- INIZIO PARTITA DI TEST ---")
    state = get_obs_state(env)
    step = 1
    average_score = []
    for i in range(num_tests):
        env.new_game()
        state = get_obs_state(env)  
        for step in range(max_steps):#episodio
            # Nessuna mossa casuale usa solo quello che ha imparato
            action = agent.getAction(state,env.get_available_actions(), epsilon=0.0)
            # Esegue la mossa
            env.choose_action(action)
            #cambio di stato
            state = get_obs_state(env)
            # controllo fine partita
            done = env.has_finished()
            step += 1
            if done:
                break    
        average_score.append(env.get_total_reward())
        #print(f"Punteggio Finale: {env.game.get_total_reward()}")
    print(f"Punteggio Medio: {sum(average_score) / len(average_score) if average_score else 0}")

In [41]:
NUM_TEST_EPISODES = 500   # Number of episodes to test the agent
MAX_STEPS_TEST = 25    # Maximum steps per test episode
if SAVE_CHKPT:
    agent.net_eval.load_state_dict(torch.load(CHECKPOINT_NAME))
test_knister_agent_fast(env, agent,NUM_TEST_EPISODES, MAX_STEPS_TEST)

--- INIZIO PARTITA DI TEST ---
Punteggio Medio: 39.982


Test con passi per vedere cosa fa il modello

In [None]:
def test_knister_agent(env, agent,num_tests, max_steps):
    print("--- INIZIO PARTITA DI TEST ---")
    state = get_obs_state(env)
    step = 1
    average_score = []
    for i in range(num_tests):
        env.new_game()
        state = get_obs_state(env)  
        for step in range(max_steps):#episodio
            # Nessuna mossa casuale usa solo quello che ha imparato
            action = agent.getAction(state,env.get_available_actions(), epsilon=0.0)
            # Esegue la mossa
            env.choose_action(action)
            #cambio di stato
            state = get_obs_state(env)
            # controllo fine partita
            done = env.has_finished()
            
            # Recuperiamo info dal gioco reale per visualizzare
            current_grid = env.get_grid()
            dice_value = env.get_current_roll()
            score = env.get_total_reward()
            
            print(f"\nStep {step}")
            print(f"Dado uscito: {dice_value}")
            print(f"L'agente ha scelto la posizione: {action} (Riga {action//5}, Col {action%5})")
            print("Griglia attuale:")
            print(current_grid)
            print(f"Punteggio attuale: {score}")
            
            step += 1
            if done:
                break    
        average_score.append(env.get_total_reward())
        #print(f"Punteggio Finale: {env.game.get_total_reward()}")
    print(f"Punteggio Medio: {sum(average_score) / len(average_score) if average_score else 0}")

In [None]:
NUM_TEST_EPISODES = 1   # Number of episodes to test the agent
MAX_STEPS_TEST = 25    # Maximum steps per test episode
if SAVE_CHKPT:
    agent.net_eval.load_state_dict(torch.load(CHECKPOINT_NAME))
test_knister_agent(env, agent,NUM_TEST_EPISODES, MAX_STEPS_TEST)

--- INIZIO PARTITA DI TEST ---

Step 0
Dado uscito: 2
L'agente ha scelto la posizione: 12 (Riga 2, Col 2)
Griglia attuale:
[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 7 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]
Punteggio attuale: 0

Step 1
Dado uscito: 8
L'agente ha scelto la posizione: 21 (Riga 4, Col 1)
Griglia attuale:
[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 7 0 0]
 [0 0 0 0 0]
 [0 2 0 0 0]]
Punteggio attuale: 0

Step 2
Dado uscito: 3
L'agente ha scelto la posizione: 8 (Riga 1, Col 3)
Griglia attuale:
[[0 0 0 0 0]
 [0 0 0 8 0]
 [0 0 7 0 0]
 [0 0 0 0 0]
 [0 2 0 0 0]]
Punteggio attuale: 0

Step 3
Dado uscito: 8
L'agente ha scelto la posizione: 3 (Riga 0, Col 3)
Griglia attuale:
[[0 0 0 3 0]
 [0 0 0 8 0]
 [0 0 7 0 0]
 [0 0 0 0 0]
 [0 2 0 0 0]]
Punteggio attuale: 0

Step 4
Dado uscito: 6
L'agente ha scelto la posizione: 20 (Riga 4, Col 0)
Griglia attuale:
[[0 0 0 3 0]
 [0 0 0 8 0]
 [0 0 7 0 0]
 [0 0 0 0 0]
 [8 2 0 0 0]]
Punteggio attuale: 2

Step 5
Dado uscito: 3
L'agente ha scelto la posizione: 6 (Riga 1, Col 1)
Gr