In [None]:
import serial
import json
import numpy as np
import time
import os
from random import choice

# =======================================================
#  üî•  CONFIGURACI√ìN INICIAL
# =======================================================
NUM_ACTIONS = 4   # 0=avanzar, 1=izquierda, 2=derecha, 3=retroceder

# --- cargar Q-table ---
if os.path.exists("qtablev2.npy"):
    Q = np.load("qtablev2.npy")
    print("Q-table cargada desde archivo.")
    if Q.shape != (8, NUM_ACTIONS):
        print("Redimensionando Q-table para 4 acciones...")
        Q2 = np.zeros((8, NUM_ACTIONS))
        Q2[:, :Q.shape[1]] = Q
        Q = Q2
else:
    Q = np.zeros((8, NUM_ACTIONS))
    print("Q-table nueva creada.")

ser = serial.Serial('/dev/ttyUSB0', 115200, timeout=1)

ACTIONS = [0, 1, 2, 3]  
alpha = 0.3
gamma = 0.9
epsilon = 0.25

# =======================================================
#  üîÑ  LECTURA DE ESTADO DESDE ESP32
# =======================================================
def get_state():
    try:
        line = ser.readline().decode().strip()
        if not line:
            return None
        data = json.loads(line)
        return np.array([data["L"], data["C"], data["R"]], dtype=int)
    except:
        return None

# =======================================================
#  üöó  ENV√çO DE ACCI√ìN AL ESP32
# =======================================================
def send_action(a):
    ser.write((str(a) + "\n").encode())

# =======================================================
#  üéØ  FUNCI√ìN DE RECOMPENSA MEJORADA
# =======================================================
def reward_function(state, action):
    """
    state: np.array([L, C, R])
    action: 0 = adelante, 1 = izquierda, 2 = derecha, 3 = retroceder
    """

    L, C, R = state
    reward = 0

    # Umbrales de distancia (sensores frontales)
    DANGER = 1   # sensor activo = obst√°culo a ~20cm
    SAFE = 0     # sensor libre

    # --- No hay obst√°culos cerca ---
    if C == SAFE:
        if action == 3:
            reward -= 5  # retroceder sin necesidad penaliza
        elif action == 0:
            reward += 2  # avanzar libremente recompensa
        else:
            reward -= 1  # girar sin necesidad penaliza

    # --- Obst√°culo frente ---
    elif C == DANGER:
        if action == 0:
            reward -= 10  # avanzar hacia obst√°culo = castigo
        elif action == 3:
            reward += 3   # retroceder cuando hay obst√°culo = recompensa
        else:
            reward += 1   # girar est√° bien

    # --- Obst√°culo lateral o combinaci√≥n ---
    if (L == DANGER and R == DANGER) and action != 3:
        reward -= 3   # atrapado, retroceder recomendado

    return reward

# =======================================================
#  üìü  PASAR ESTADO A √çNDICE 0‚Äì7
# =======================================================
def state_to_index(s):
    return s[0] * 4 + s[1] * 2 + s[2] * 1

# =======================================================
#  üöÄ  ENVIAR ACCI√ìN INICIAL
# =======================================================
print("Enviando acci√≥n inicial (avanzar = 0)...")
send_action(0)
time.sleep(2)
print("Comenzando entrenamiento...")

last_save = time.time()

# =======================================================
#  üîÅ  BUCLE PRINCIPAL DE REINFORCEMENT LEARNING
# =======================================================
while True:
    state = get_state()
    if state is None:
        continue

    idx = state_to_index(state)

    # epsilon-greedy
    if np.random.rand() < epsilon:
        action = choice(ACTIONS)
    else:
        action = np.argmax(Q[idx])

    send_action(action)
    time.sleep(0.15)

    new_state = get_state()
    if new_state is None:
        continue

    reward = reward_function(new_state, action)
    new_idx = state_to_index(new_state)

    # Q-learning update
    Q[idx, action] = Q[idx, action] + alpha * (
        reward + gamma * np.max(Q[new_idx]) - Q[idx, action]
    )

    print(f"State {state} | Action {action} | Reward {reward:.2f}")

    # Guardado autom√°tico
    if time.time() - last_save > 5:
        np.save("qtablev2.npy", Q)
        print("Q-table guardada.")
        last_save = time.time()


Q-table nueva creada.
Enviando acci√≥n inicial (avanzar = 0)...
Comenzando entrenamiento...
State [0 0 0] | Action 1 | Reward -1.00
State [0 0 0] | Action 0 | Reward 2.00
State [0 0 0] | Action 0 | Reward 2.00
State [0 0 0] | Action 0 | Reward 2.00
State [0 0 0] | Action 2 | Reward -1.00
State [0 0 0] | Action 0 | Reward 2.00
State [0 0 0] | Action 0 | Reward 2.00
State [0 0 0] | Action 0 | Reward 2.00
State [0 0 0] | Action 2 | Reward -1.00
State [0 0 0] | Action 0 | Reward 2.00
State [0 1 0] | Action 0 | Reward -10.00
State [0 1 0] | Action 1 | Reward 1.00
Q-table guardada.
State [0 1 0] | Action 3 | Reward 3.00
State [0 1 0] | Action 3 | Reward 3.00
State [0 1 0] | Action 3 | Reward -5.00
State [0 0 0] | Action 0 | Reward 2.00
State [0 0 0] | Action 0 | Reward 2.00
State [1 1 0] | Action 0 | Reward -10.00
State [0 1 0] | Action 3 | Reward 3.00
State [0 0 0] | Action 0 | Reward 2.00
State [0 1 0] | Action 3 | Reward 3.00
State [0 1 0] | Action 2 | Reward 1.00
State [0 1 0] | Action 3

In [1]:
import numpy as np

Q = np.load("qtablev2.npy")
Q

Q_int = (Q * 1000).astype(int)  # opcional: escalar

with open("qtablev2.h", "w") as f:
    f.write("const int Q[8][4] = {\n")
    for row in Q_int:
        f.write("  {" + ", ".join(map(str, row)) + "},\n")
    f.write("};\n")
