<a href="https://colab.research.google.com/github/jangyoujin0917/CS377_Project/blob/main/Team02_VRAIL/code/Effect_of_shaped_reward_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Colab 사용을 위한 실행 가이드
============================

1️⃣ Colab에서 실행 준비
------------------------
# 디렉토리 생성 및 설정 파일 업로드
!mkdir -p instruction
!touch instruction/instr.txt
# instr.txt 파일을 Colab에 업로드

2️⃣ instr.txt 예시
------------------------
# 파일 경로: instruction/instr.txt

train_bilevel_vrail
cycles=20
epochs=100
gamma=0.99

alpha=0.
alpha_schedule_mode=None     # 또는 'linear', 'exp', 'log', 'sigmoid'
alpha_start=1

epsilon=1.0
epsilon_decay=0.995
min_epsilon=0.01

lr=0.001
batch_size=64
memory_size=50000


train_q_with_fixed_w
print_every=10

train_w_supervised
epochs=50
lr=0.01

settings
SEED=0


3️⃣ 실행 예시
------------------------
# 학습 및 결과 저장
train_bilevel_vrail_from_instruction(save_path="results")


4️⃣ 결과 저장 구조
------------------------
results/
└── [디렉토리 이름 형식]

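    -  현재 코드: alpha_schedule_mode와 관계없이 항상 아래 형식으로 저장됩니다.
        2025-XX-XX_HH-MM-SS_thm_seed{SEED}/

    -  (이전 버전) alpha_schedule_mode가 "None"인 경우:
        2025-XX-XX_HH-MM-SS_alpha{alpha}_seed{SEED}/

    -  (이전 버전) alpha_schedule_mode가 "None"이 아닌 경우:
        2025-XX-XX_HH-MM-SS_seed{SEED}/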

    📂 디렉토리 내용:
    ├── analysis.txt         # 학습 성능 분석
    ├── model.pth            # 학습된 Q-network 모델
    ├── reward_plot.png      # 보상 그래프
    ├── settings.txt         # 사용된 설정 복사본
    ├── weights.csv          # 학습된 shaping weight (w)
    └── rewards.csv          # 각 에피소드별 보상 기록
"""
# Emit an empty line; presumably this keeps the guide string above from being
# the cell's final expression (and therefore echoed) -- TODO confirm intent.
print()




In [None]:
import os
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

import shutil, csv, pickle, random
from datetime import datetime
import torch, numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
from collections import deque, defaultdict

# Use the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_w_from_csv(seed, base_dir="D:/D-programming/python/CS377_project/test/result_DQN"):
    """Load a saved shaping-weight vector for the run that used ``seed``.

    Scans the sub-folders of ``base_dir`` for one whose name contains
    ``seed<seed>`` and that holds a ``weights.csv`` file, then reads that file
    (skipping its one-line header).

    Args:
        seed: Seed value to look for in the folder names.
        base_dir: Directory whose sub-folders are searched.

    Returns:
        A float32 tensor on ``device``. A single-row 2-D array is flattened
        to 1-D; anything else keeps the shape ``np.loadtxt`` produced.

    Raises:
        FileNotFoundError: If ``base_dir`` does not exist or no matching
            folder contains ``weights.csv``.
    """
    import re  # local import: only this helper needs it

    # Match "seed<seed>" only when no further digit follows, so that seed=1
    # does not silently pick up the weights of a "..._seed10" run.
    seed_tag = re.compile(rf"seed{re.escape(str(seed))}(?!\d)")

    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # reproducible when several runs share the same seed.
    for folder in sorted(os.listdir(base_dir)):
        if not seed_tag.search(folder):
            continue
        weight_path = os.path.join(base_dir, folder, "weights.csv")
        if not os.path.isfile(weight_path):
            continue

        print(f"📂 w loaded from: {weight_path}")
        w_array = np.loadtxt(weight_path, delimiter=",", skiprows=1)
        if w_array.ndim == 2 and w_array.shape[0] == 1:
            w_array = w_array.squeeze(0)
        return torch.tensor(w_array, dtype=torch.float32).to(device)

    raise FileNotFoundError(f"❌ weights.csv not found for seed {seed} in {base_dir}")


# === 1. 설정 파싱 및 디바이스 초기화 ===
def _coerce_instruction_value(text):
    """Convert a raw setting string to bool, int or float when it looks like one."""
    lowered = text.lower()
    if lowered == "true":
        return True
    if lowered == "false":
        return False

    try:
        return int(text)
    except ValueError:
        pass

    # Only try float on text that contains a digit, so words such as "inf" or
    # "nan" stay strings while "0.99", "1." and "1e-3" become numbers.
    if any(ch.isdigit() for ch in text):
        try:
            return float(text)
        except ValueError:
            pass

    return text  # keep as a string (e.g. "None", "linear")


def parse_instruction_file(path):
    """Parse a sectioned ``key=value`` instruction file.

    A line without ``=`` starts a new section; following ``key=value`` lines
    are stored under it. Keys seen before any section header go to the
    ``"default"`` section. Blank lines and everything after a ``#`` (full-line
    or trailing comment) are ignored.

    Values are converted to ``bool`` (``true``/``false``, case-insensitive),
    ``int`` or ``float`` when possible, and otherwise kept as strings. The
    literal text ``None`` is therefore returned as the string ``"None"``.

    Args:
        path: Path of the instruction file.

    Returns:
        ``defaultdict(dict)`` mapping section name to its settings dict.
    """
    parsed = defaultdict(dict)
    current_section = "default"  # section used until the first header is seen

    with open(path, "r") as f:
        for raw_line in f:
            # Cut the comment first so a trailing "# ..." never leaks into a
            # value or a section name.
            line = raw_line.split("#", 1)[0].strip()
            if not line:
                continue

            # A line without "=" is a section header (e.g. train_bilevel_vrail).
            if "=" not in line:
                current_section = line
                continue

            key, _, val = line.partition("=")
            parsed[current_section][key.strip()] = _coerce_instruction_value(val.strip())

    return parsed

def initialize(seed=42):
    """Seed every random source used here and request deterministic torch kernels.

    Seeds Python's ``random``, NumPy, and torch (CPU and CUDA), switches cuDNN
    to deterministic mode with benchmarking off, and enables torch's
    deterministic-algorithms mode.

    Args:
        seed: Seed applied to all generators.

    Returns:
        The seed that was applied.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)

    return seed

# === 2. 모델 및 유틸리티 ===
class VRAILModel(nn.Module):
    """Fully connected Q-network: input -> 64 -> 64 -> one output per action."""

    def __init__(self, state_size, action_size):
        """Build the network.

        Args:
            state_size: Length of the input feature vector.
            action_size: Number of outputs (one per action).
        """
        super().__init__()
        hidden = 64
        layers = [
            nn.Linear(state_size, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, action_size),
        ]
        # The attribute name is part of the state_dict keys; keep it stable.
        self.structure = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for input ``x``."""
        return self.structure(x)

class ReplayBuffer:
    """Bounded FIFO store of transitions that can be frozen against new pushes."""

    def __init__(self, capacity, allow_push=True):
        """Create a buffer keeping at most ``capacity`` of the newest items."""
        self.buffer = deque(maxlen=capacity)
        self.allow_push = allow_push

    def push(self, *args):
        """Store ``args`` as one tuple; does nothing while the buffer is frozen."""
        if not self.allow_push:
            return
        self.buffer.append(args)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct items and return them column-wise as a zip."""
        picked = random.sample(self.buffer, batch_size)
        return zip(*picked)

    def __len__(self):
        """Return the number of stored items."""
        return len(self.buffer)

    def freeze(self):
        """Make subsequent ``push`` calls no-ops."""
        self.allow_push = False

    def unfreeze(self):
        """Allow ``push`` to store items again."""
        self.allow_push = True

# === 3. 환경 처리 및 훈련 ===
def make_env(seed):
    """Create a ``Taxi-v3`` environment seeded with ``seed``.

    The environment is reset once with the seed, and both its action space
    and its observation space are seeded with the same value.

    Args:
        seed: Seed passed to the environment reset and to both spaces.

    Returns:
        The seeded environment.
    """
    taxi = gym.make("Taxi-v3")
    taxi.reset(seed=seed)
    for space in (taxi.action_space, taxi.observation_space):
        space.seed(seed)
    return taxi

def preprocess_state_with_walls(state, env):
    """Encode a state as four concatenated one-hot vectors.

    The state is decoded through ``env.unwrapped.decode`` into row, column,
    passenger location and destination index, which are one-hot encoded with
    widths 5, 5, 5 and 4. Despite the function name, no wall information is
    added here.

    Args:
        state: Encoded state accepted by ``env.unwrapped.decode``.
        env: Environment whose unwrapped form provides ``decode``.

    Returns:
        A 1-D NumPy array of length 19.
    """
    row, col, pass_loc, dest_idx = env.unwrapped.decode(state)
    fields = (row, col, pass_loc, dest_idx)
    widths = (5, 5, 5, 4)
    # Selecting row `value` of an identity matrix yields the one-hot vector.
    pieces = [np.eye(width)[value] for value, width in zip(fields, widths)]
    return np.concatenate(pieces)

# mode : None, linear, exp, log, sigmoid
def compute_scheduled_alpha(cycle, total, alpha_start, alpha_target, mode = None):
    """Return alpha for ``cycle`` on a schedule from ``alpha_start`` to ``alpha_target``.

    Progress is ``p = cycle / (total - 1)``, so the first cycle maps to 0 and
    the last to 1. When ``total`` is 1 there is nothing to interpolate over,
    and ``p`` is fixed at 0 instead of dividing by zero.

    Supported modes:
        * ``"linear"``    - straight-line interpolation in ``p``.
        * ``"exp"``       - geometric interpolation; divides by ``alpha_start``,
          so that value must be non-zero.
        * ``"log"``       - logarithmic ramp ``log1p(99 p) / log1p(99)``.
        * ``"sigmoid"``   - logistic ramp centred at ``p = 0.5`` with slope 10.
        * ``"user-ftn1"`` - ``alpha_start`` below ``p = 0.2``, ``alpha_target``
          above ``p = 0.7``, linear in between.
        * ``"user-ftn2"`` - ``alpha_start`` below ``p = 0.1``, ``alpha_target``
          above ``p = 0.7``, logistic (slope 10) in between.
        * anything else (including ``None``) - always ``alpha_target``.

    Args:
        cycle: Zero-based index of the current cycle.
        total: Total number of cycles.
        alpha_start: Value at the beginning of the schedule.
        alpha_target: Value at the end of the schedule.
        mode: Schedule name, see above.

    Returns:
        The scheduled alpha value.
    """
    steps = total - 1
    p = cycle / steps if steps != 0 else 0.0
    span = alpha_target - alpha_start

    if mode == "linear":
        return alpha_start + span * p

    if mode == "exp":
        return alpha_start * ((alpha_target / alpha_start) ** p)

    if mode == "log":
        return alpha_start + span * (np.log1p(99 * p) / np.log1p(99))

    if mode == "sigmoid":
        K = 10  # slope of the logistic curve
        return alpha_start + span * (1 / (1 + np.exp(-K * (p - 0.5))))

    if mode == "user-ftn1":
        p_start = 0.2
        p_end = 0.7
        if p < p_start:
            return alpha_start
        if p > p_end:
            return alpha_target
        # Linear interpolation inside [p_start, p_end].
        ratio = (p - p_start) / (p_end - p_start)
        return alpha_start + span * ratio

    if mode == "user-ftn2":
        p_start = 0.1
        p_end = 0.7
        if p < p_start:
            return alpha_start
        if p > p_end:
            return alpha_target
        # Logistic ramp inside [p_start, p_end]; p is first rescaled to [0, 1].
        K = 10  # slope of the logistic curve
        norm_p = (p - p_start) / (p_end - p_start)
        sigmoid_val = 1 / (1 + np.exp(-K * (norm_p - 0.5)))
        return alpha_start + span * sigmoid_val

    return alpha_target

def train_q_with_fixed_w(model, target_net, w, episodes, alpha, gamma, epsilon, epsilon_decay,
                         min_epsilon, optimizer, memory, batch_size, env, rewards,
                         feature_value_data, print_every=10):
    """Train the Q-network for ``episodes`` episodes with a fixed shaping weight ``w``.

    Actions are chosen epsilon-greedily. Every transition is pushed to
    ``memory``; once it holds at least ``batch_size`` items, one gradient step
    is taken per environment step on a sampled batch. The batch reward is
    augmented with ``gamma * (ns . w) - (s . w)``, where ``s`` and ``ns`` are
    the encoded state and next state.

    Args:
        model: Q-network being trained.
        target_net: Network used for the bootstrap value; synced from ``model``
            on the episodes where progress is printed.
        w: Shaping weights, multiplied element-wise with the encoded states.
        episodes: Number of episodes to run.
        alpha: Not used by the current shaping term; kept so existing callers
            keep working.
        gamma: Discount factor.
        epsilon: Initial exploration probability.
        epsilon_decay: Multiplicative decay applied after every episode.
        min_epsilon: Lower bound for ``epsilon``.
        optimizer: Optimizer for ``model``'s parameters.
        memory: Replay buffer providing ``push``, ``sample`` and ``len``.
        batch_size: Number of transitions per gradient step.
        env: Environment to interact with.
        rewards: List extended in place with each episode's unshaped return.
        feature_value_data: List extended in place with
            ``(encoded_state, max_a Q(state, a))`` pairs for every sampled
            state after each update.
        print_every: Interval, in episodes, for printing progress and syncing
            ``target_net``.

    Returns:
        The value of ``epsilon`` after the last episode.
    """
    loss_fn = nn.MSELoss()

    for episode in range(episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            state_vec = preprocess_state_with_walls(state, env)
            state_tensor = torch.from_numpy(state_vec.astype(np.float32)).unsqueeze(0).to(device)

            # Epsilon-greedy action selection.
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    q_vals = model(state_tensor)
                action = q_vals.argmax().item()

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward

            # NOTE(review): `done` is also True on truncation, so truncated
            # transitions stop bootstrapping in the target below -- confirm
            # this is intended.
            memory.push(state, action, reward, next_state, done)
            state = next_state

            if len(memory) >= batch_size:
                s_batch, a_batch, r_batch, ns_batch, d_batch = memory.sample(batch_size)

                s_array = np.array([preprocess_state_with_walls(s, env) for s in s_batch], dtype=np.float32)
                s_tensor = torch.from_numpy(s_array).to(device)
                a_tensor = torch.tensor(a_batch, dtype=torch.int64).unsqueeze(1).to(device)
                r_tensor = torch.tensor(r_batch, dtype=torch.float32).unsqueeze(1).to(device)
                ns_array = np.array([preprocess_state_with_walls(ns, env) for ns in ns_batch], dtype=np.float32)
                ns_tensor = torch.from_numpy(ns_array).to(device)
                d_tensor = torch.tensor(d_batch, dtype=torch.float32).unsqueeze(1).to(device)

                q_values = model(s_tensor)
                q_selected = q_values.gather(1, a_tensor)

                with torch.no_grad():
                    q_next = target_net(ns_tensor)
                    max_q_next = q_next.max(1)[0].unsqueeze(1)

                    # keepdim=True keeps the bonus at [B, 1] like r_tensor.
                    # Without it the [B] bonus broadcasts against [B, 1] and
                    # the target becomes [B, B], pairing every sample's
                    # Q-value with every other sample's target.
                    shaping_bonus = (gamma * (ns_tensor * w) - (s_tensor * w)).sum(dim=1, keepdim=True)
                    shaped_reward = r_tensor + shaping_bonus

                target = shaped_reward + gamma * max_q_next * (1 - d_tensor)
                loss = loss_fn(q_selected, target)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Record (features, state value) pairs from the updated network.
                with torch.no_grad():
                    q_vals_all = model(s_tensor)
                    v_vals = q_vals_all.max(1)[0]
                    for f, v in zip(s_tensor, v_vals):
                        feature_value_data.append((f.cpu().numpy(), v.item()))

        rewards.append(total_reward)
        epsilon = max(min_epsilon, epsilon * epsilon_decay)

        if episode % print_every == 0 or episode == episodes - 1:
            # The target network is refreshed on the same schedule as printing.
            target_net.load_state_dict(model.state_dict())
            print(f"  [Q] Episode {episode:4d} | Reward: {total_reward:4d} | Epsilon: {epsilon:.3f}")
    return epsilon


def train_w_supervised(feature_value_data, w, epochs=100, lr=0.01):
    """Fit linear weights so that ``features @ w`` approximates the recorded values.

    Runs ``epochs`` full-batch Adam steps on the mean-squared error, starting
    from the given ``w``.

    Args:
        feature_value_data: Sequence of ``(feature_vector, value)`` pairs.
        w: 1-D tensor of initial weights, one per feature.
        epochs: Number of optimisation steps.
        lr: Adam learning rate.

    Returns:
        A detached 1-D tensor of fitted weights. If ``feature_value_data`` is
        empty there is nothing to fit and a detached copy of ``w`` is returned.
    """
    features = np.array([f for f, _ in feature_value_data], dtype=np.float32)
    values = np.array([v for _, v in feature_value_data], dtype=np.float32).reshape(-1, 1)

    print(f"🧮 Training samples: {features.shape[0]} feature vectors, {values.shape[0]} value entries")

    # With no samples the matrix product below is ill-defined; keep w as is.
    if features.shape[0] == 0:
        return w.clone().detach()

    X = torch.from_numpy(features).to(device)
    y = torch.from_numpy(values).to(device)

    # Column vector so that X @ w has the same (N, 1) shape as y.
    w = nn.Parameter(w.clone().detach().unsqueeze(1), requires_grad=True)
    optimizer = optim.Adam([w], lr=lr)
    loss_fn = nn.MSELoss()

    for _ in range(epochs):
        pred = X @ w
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Drop only the column axis so a single-feature w stays 1-D.
    return w.detach().squeeze(1)


def moving_average(data, window=50):
    """Return the simple moving average of ``data``.

    Only fully covered windows are produced, so the result has
    ``len(data) - window + 1`` entries. When ``data`` is shorter than
    ``window`` the window shrinks to ``len(data)`` and a single overall mean
    is returned. An empty ``data`` yields an empty array.

    Args:
        data: 1-D sequence of numbers.
        window: Number of consecutive samples per average; must be >= 1.

    Returns:
        1-D float NumPy array of averages.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return np.empty(0)

    # np.convolve(..., mode='valid') swaps its operands when the kernel is the
    # longer one, which would return values that are not averages of `data`.
    width = min(window, values.size)
    return np.convolve(values, np.ones(width) / width, mode='valid')

def increasing_alpha(mode=None):
    """Validate an alpha schedule mode.

    Args:
        mode: Schedule name, ``None``, or the string ``"None"`` (the form an
            unquoted ``None`` takes when read from an instruction file).

    Returns:
        ``False`` when no schedule is requested, otherwise ``mode`` itself.

    Raises:
        RuntimeError: If ``mode`` is not a known schedule name.
    """
    modes = ("linear", "exp", "log", "sigmoid", "sigmoid-tan", "user-ftn1", "user-ftn2")
    if mode is None or mode == "None":
        return False
    if mode in modes:
        return mode
    raise RuntimeError("Invalid alpha_schedule_mode")

def train_bilevel_vrail_from_instruction(instr_path="instruction/instr.txt", save_path="instruction"):
    """Run a full training job described by an instruction file and save the results.

    Reads hyper-parameters from ``instr_path``, trains the Q-network for
    ``cycles`` rounds of ``epochs`` episodes each with a fixed shaping weight
    ``w`` (loaded through ``load_w_from_csv`` or zero-initialised when no
    saved weights are found), and writes ``model.pth``, ``weights.csv``,
    ``rewards.csv``, ``settings.txt``, ``analysis.txt`` and
    ``reward_plot.png`` into ``<save_path>/<timestamp>_thm_seed<SEED>/``.

    Args:
        instr_path: Path of the instruction file. Its ``train_bilevel_vrail``
            section must define ``lr``, ``memory_size``, ``cycles``,
            ``epochs``, ``epsilon``, ``epsilon_decay``, ``min_epsilon``,
            ``batch_size``, ``gamma``, ``alpha_start``, ``alpha`` and
            ``alpha_schedule_mode``.
        save_path: Directory under which the run folder is created.

    Raises:
        KeyError: If a required setting is missing.
    """
    # 1. Parse the instruction file.
    parsed = parse_instruction_file(instr_path)
    cfg = parsed["train_bilevel_vrail"]
    seed_cfg = parsed.get("settings", {})

    # 2. Cache settings in locals.
    print_every = parsed.get("train_q_with_fixed_w", {}).get("print_every", 10)
    SEED = int(seed_cfg.get("SEED", 42))
    initialize(seed=SEED)

    lr               = cfg["lr"]
    memory_size      = cfg["memory_size"]
    cycles           = cfg["cycles"]
    q_train_episodes = cfg["epochs"]
    epsilon          = cfg["epsilon"]
    epsilon_decay    = cfg["epsilon_decay"]
    min_epsilon      = cfg["min_epsilon"]
    batch_size       = cfg["batch_size"]
    gamma            = cfg["gamma"]
    alpha_start      = cfg["alpha_start"]
    alpha_target     = cfg["alpha"]
    alpha_mode       = cfg["alpha_schedule_mode"]

    # 3. Environment and networks.
    env = make_env(SEED)
    # 19 = 5 + 5 + 5 + 4, the length produced by preprocess_state_with_walls.
    state_size, action_size = 19, env.action_space.n

    model = VRAILModel(state_size, action_size).to(device)
    target_net = VRAILModel(state_size, action_size).to(device)

    target_net.load_state_dict(model.state_dict())
    target_net.eval()

    optimizer = optim.Adam(model.parameters(), lr=lr)
    memory = ReplayBuffer(memory_size)

    # Reuse previously trained shaping weights when available.
    try:
        w = load_w_from_csv(SEED)
    except FileNotFoundError:
        print("⚠️ weights.csv not found. Initializing w as zeros.")
        w = torch.zeros(state_size).to(device)

    rewards = []

    # 4. Training loop. The environment is closed even if training fails.
    try:
        for cycle in range(cycles):
            # Collected by the Q-training step. w itself is kept fixed in this
            # experiment, so the data is not used for re-fitting here.
            feature_value_data = []

            # Continue the per-episode decay across cycles, but never start a
            # cycle below min_epsilon (the inner loop only clamps after its
            # first episode).
            eps = max(min_epsilon, epsilon * (epsilon_decay ** (q_train_episodes * cycle)))

            alpha = compute_scheduled_alpha(cycle, cycles, alpha_start, alpha_target, mode = alpha_mode)

            if cycle == 0: alpha = 0.

            print(f"\n🔁 Cycle {cycle+1}/{cycles} | Alpha: {alpha:.6f}")

            # Q-network training with the fixed w.
            train_q_with_fixed_w(
                model, target_net, w, q_train_episodes, alpha, gamma,
                eps, epsilon_decay, min_epsilon, optimizer, memory,
                batch_size, env, rewards, feature_value_data,
                print_every
            )

            target_net.load_state_dict(model.state_dict())
    finally:
        env.close()

    # 5. Save artefacts and plots.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    folder_name = f"{timestamp}_thm_seed{SEED}"

    save_dir = os.path.join(save_path, folder_name)
    os.makedirs(save_dir, exist_ok=True)

    # Model (PTH).
    torch.save(model.state_dict(), os.path.join(save_dir, "model.pth"))

    # Shaping weights (CSV): a 1-D w is written as a single row.
    weight_file = os.path.join(save_dir, "weights.csv")
    if w.ndim == 1:
        np.savetxt(weight_file,
                  w.cpu().reshape(1, -1).numpy(),
                  delimiter=",",
                  header=",".join([f"w{i}" for i in range(len(w))]),
                  comments='',
                  fmt="%.4f")
    else:
        np.savetxt(weight_file,
                  w.cpu().numpy(),
                  delimiter=",",
                  header=",".join([f"w{i}" for i in range(w.shape[1])]),
                  comments='',
                  fmt="%.4f")

    # Per-episode rewards (CSV).
    np.savetxt(os.path.join(save_dir, "rewards.csv"), rewards, delimiter=",", header="reward", comments='', fmt="%.1f")

    # Copy of the settings used (TXT).
    shutil.copy(instr_path, os.path.join(save_dir, "settings.txt"))

    # Analysis (TXT). Computed once and reused for the plot below.
    R = moving_average(rewards)
    thresholds = [-10, -5, 0, 5]

    analysis_path = os.path.join(save_dir, "analysis.txt")
    # The report contains emoji and Korean text, which the platform default
    # encoding may not be able to represent; write it as UTF-8 explicitly.
    with open(analysis_path, "w", encoding="utf-8") as f:
        f.write("📊 VRAIL 학습 분석\n")
        f.write("=" * 40 + "\n\n")

        # First episode at which the moving average reaches each threshold.
        f.write("🎯 Threshold 도달 에피소드 (Moving Avg 기준)\n")
        for t in thresholds:
            reached = np.array(R) >= t
            # +50 converts a moving-average index to an episode count.
            step = int(np.argmax(reached)) + 50 if np.any(reached) else -1
            f.write(f"  - ≥ {t:+} 도달: {step if step != -1 else '도달 실패'} 에피소드\n")
        f.write("\n")

        # Final and best moving-average reward.
        f.write(f"🏁 마지막 Moving Avg: {R[-1]:.2f}\n")
        f.write(f"⭐ 최대 Moving Avg: {max(R):.2f}\n\n")

    plt.plot(rewards, label="Reward")
    plt.plot(R, label="Moving Avg (50)")
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title(f"Bi-level VRAIL (seed={SEED})")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(save_dir, "reward_plot.png"))
    plt.close()

    print(f"✅ 저장 완료: {save_dir}")

In [None]:
# Train once per instruction file (instr0.txt ... instr4.txt); every run is
# saved to its own folder under result_DQN_with_weight/.
instructions = [f"instr{index}.txt" for index in range(5)]
for instr in instructions:
    instr_path = os.path.join("instruction", instr)
    train_bilevel_vrail_from_instruction(
        instr_path,
        save_path="result_DQN_with_weight",
    )
print()

📂 w loaded from: D:/D-programming/python/CS377_project/test/result_DQN\2025-05-31_19-58-34_thm_seed0\weights.csv

🔁 Cycle 1/20 | Alpha: 0.000000


  return F.mse_loss(input, target, reduction=self.reduction)


  [Q] Episode    0 | Reward: -875 | Epsilon: 0.995
  [Q] Episode   10 | Reward: -650 | Epsilon: 0.946
  [Q] Episode   20 | Reward: -677 | Epsilon: 0.900
  [Q] Episode   30 | Reward: -659 | Epsilon: 0.856
  [Q] Episode   40 | Reward: -740 | Epsilon: 0.814
  [Q] Episode   50 | Reward: -695 | Epsilon: 0.774
  [Q] Episode   60 | Reward: -506 | Epsilon: 0.737
  [Q] Episode   70 | Reward: -587 | Epsilon: 0.701
  [Q] Episode   80 | Reward: -461 | Epsilon: 0.666
  [Q] Episode   90 | Reward: -587 | Epsilon: 0.634
  [Q] Episode   99 | Reward: -479 | Epsilon: 0.606

🔁 Cycle 2/20 | Alpha: 0.000000
  [Q] Episode    0 | Reward: -569 | Epsilon: 0.603
  [Q] Episode   10 | Reward: -479 | Epsilon: 0.573
  [Q] Episode   20 | Reward: -371 | Epsilon: 0.545
  [Q] Episode   30 | Reward:  -31 | Epsilon: 0.519
  [Q] Episode   40 | Reward: -560 | Epsilon: 0.493
  [Q] Episode   50 | Reward: -216 | Epsilon: 0.469
  [Q] Episode   60 | Reward: -452 | Epsilon: 0.446
  [Q] Episode   70 | Reward: -461 | Epsilon: 0.424