# Modules

In [1]:
import json
import numpy as np

import torch.distributions as dist

import torch
from torch import nn
from torch import optim

# Data Structures

In [2]:
class Category():
    """A budget category comparing a planned ratio with the observed one.

    Attributes:
        name: Label of the category.
        ratio: Planned share for the category (starts at 0.0).
        actual_ratio: Observed share for the category (starts at 0.0).
        user_satisfaction: Satisfaction score for the category (starts at 0.0).
        is_depense: True when the category was created with a truthy
            `depense` flag, False otherwise.
    """

    def __init__(self, name, depense:bool):
        self.name = name
        # Normalise any truthy/falsy flag to a real bool.
        self.is_depense = bool(depense)
        self.reset()

    def benefit(self):
        """Return the signed gap between planned and observed ratios.

        For a `depense` category the value is `ratio - actual_ratio`;
        otherwise it is `actual_ratio - ratio`.
        """
        planned, observed = self.ratio, self.actual_ratio
        return planned - observed if self.is_depense else observed - planned

    def satisafaction(self):
        """Return the stored user satisfaction (method name kept as spelled)."""
        return self.user_satisfaction

    def reset(self):
        """Set both ratios and the satisfaction score back to 0.0."""
        self.ratio = 0.0
        self.actual_ratio = 0.0
        self.user_satisfaction = 0.0

In [3]:
class Goal():
    """A savings goal: a target amount and the money gathered towards it.

    Attributes:
        goal: Target amount to reach.
        money: Amount contributed so far (starts at 0.0).
    """

    def __init__(self, goal):
        self.goal = goal
        self.money = 0.0

    def add(self, cach):
        """Add `cach` to the money already gathered for this goal.

        Contributions accumulate across calls; the previous version
        overwrote the balance, so only the last contribution counted.
        """
        self.money += cach

    def achievement(self):
        """Return progress towards the goal, capped at 1.0.

        Returns 1.0 once `money` reaches `goal`, otherwise `money / goal`.
        """
        if self.money >= self.goal:
            return 1.0
        return self.money / self.goal

    def reset(self):
        """Discard all gathered money."""
        self.money = 0.0

In [4]:
class Investment():
    """Tracks money put into an investment and money received from it.

    Attributes:
        income: Total received from the investment (starts at 0).
        outcome: Total put into the investment (starts at 0).
    """

    def __init__(self):
        self.reset()

    def is_running(self):
        """Return True when some money has been put in (`outcome > 0`)."""
        return self.outcome > 0

    def profit(self):
        """Return `income / outcome`, or 0 while nothing has been invested."""
        if not self.is_running():
            return 0
        return self.income / self.outcome

    def reset(self):
        """Clear both running totals."""
        self.income = self.outcome = 0

# Framework

In [5]:
class Action():
    """Plain container for one allocation decision.

    Attributes:
        categories_ratios: Indexable collection with one ratio per category.
        goal_ratio: Ratio directed to the goal.
        investment_ratio: Ratio directed to the investment.
    """

    def __init__(self, categories_ratios, goal_ratio, investment_ratio):
        # Store the three parts of the decision unchanged.
        (self.categories_ratios,
         self.goal_ratio,
         self.investment_ratio) = categories_ratios, goal_ratio, investment_ratio

In [6]:
class Simulation():
    """Plain container for one recorded simulation step.

    Attributes:
        money: Money value recorded for the step.
        actual_categories_ratios: Observed ratio for each category.
        user_satisfactions: Satisfaction score for each category.
        investment_income: Income received from the investment
            (passed in through the `investment_icome` argument).
    """

    def __init__(self, money, actual_categories_ratios, user_satisfactions, investment_icome):
        self.money = money
        self.actual_categories_ratios = actual_categories_ratios
        self.user_satisfactions = user_satisfactions
        # The keyword keeps its original spelling so existing callers still work.
        self.investment_income = investment_icome

In [7]:
class Env():
    """Budgeting environment tying categories, a goal and an investment together.

    Attributes:
        money: Current amount of money held by the environment.
        categories: The `Category` objects being budgeted.
        goal: The `Goal` receiving a share of the money.
        investment: The `Investment` receiving a share of the money.
    """

    def __init__(self, initial_money, categories: list[Category], goal: Goal, investment: Investment):
        self.money = initial_money
        self.categories = categories
        self.goal = goal
        self.investment = investment

    def add(self, cach_in):
        """Increase the available money by `cach_in`."""
        self.money += cach_in

    def observation(self):
        """Return the state as a flat NumPy array.

        Layout: money, every category's actual ratio, every category's
        user satisfaction, goal achievement, investment profit.
        """
        features = [self.money]
        features.extend(category.actual_ratio for category in self.categories)
        features.extend(category.user_satisfaction for category in self.categories)
        features.append(self.goal.achievement())
        features.append(self.investment.profit())
        return np.array(features)

    def update(self, action: Action):
        """Apply an action: set planned ratios and fund the goal and investment."""
        for idx, category in enumerate(self.categories):
            category.ratio = action.categories_ratios[idx]
        self.goal.add(action.goal_ratio * self.money)
        self.investment.outcome += action.investment_ratio * self.money

    def simulate(self, simulation: Simulation):
        """Copy the observed ratios, satisfactions and income from a simulation step."""
        for idx, category in enumerate(self.categories):
            category.actual_ratio = simulation.actual_categories_ratios[idx]
            category.user_satisfaction = simulation.user_satisfactions[idx]
        self.investment.income += simulation.investment_income

    def reward(self):
        """Return the sum of all benefits, satisfactions, goal achievement and profit."""
        terms = [category.benefit() for category in self.categories]
        terms += [category.satisafaction() for category in self.categories]
        terms += [self.goal.achievement(), self.investment.profit()]
        return sum(terms)

# Utils

In [8]:
def load_simulations(path):
    """Read a JSON list of records and build one `Simulation` per record.

    Every value in a record is converted to float; list values are
    converted element by element. The record's keys are passed to
    `Simulation` as keyword arguments.
    """
    def _to_float(value):
        # Lists become lists of floats, anything else a single float.
        if isinstance(value, list):
            return [float(item) for item in value]
        return float(value)

    with open(path) as f:
        records = json.load(f)

    return [
        Simulation(**{key: _to_float(value) for key, value in record.items()})
        for record in records
    ]

def get_simulation(simulations, iter):
    """Return the entry of `simulations` at index `iter`.

    The parameter name `iter` shadows the builtin inside this function;
    it is kept so keyword callers continue to work.
    """
    position = iter
    return simulations[position]

In [10]:
def get_action(action, num_categories=5):
    """Split a flat action vector into an `Action`.

    Args:
        action: Indexable, sliceable vector laid out as
            `[category ratios..., goal ratio, investment ratio]`.
        num_categories: Number of leading entries that are category
            ratios. Defaults to 5, the value that used to be hard-coded.

    Returns:
        An `Action` holding the first `num_categories` entries as category
        ratios, the next entry as the goal ratio and the one after that as
        the investment ratio.
    """
    goal_index = num_categories
    return Action(
        categories_ratios=action[:num_categories],
        goal_ratio=action[goal_index],
        investment_ratio=action[goal_index + 1],
    )

In [11]:
def to_tensor(obs, acts, rewards, log_probs, values, advantages):
    """Convert the six rollout collections to float tensors.

    Returns a tuple of six `torch.float` tensors in the same order as the
    arguments.
    """
    fields = (obs, acts, rewards, log_probs, values, advantages)
    return tuple(torch.tensor(field, dtype=torch.float) for field in fields)

In [33]:
def get_batches(data, batch_size):
    """Split `data` into consecutive slices of at most `batch_size` items.

    The last slice is shorter when `len(data)` is not a multiple of
    `batch_size`.
    """
    return [data[start:start + batch_size]
            for start in range(0, len(data), batch_size)]

# Simulation

In [9]:
# Load the recorded simulation steps used by the dry run further down.
Simulations = load_simulations("../data/simulations.json")

In [10]:
# Build the pieces of the environment: one investment and one goal of 120.
investment = Investment()
goal = Goal(120)

# Four categories flagged as `depense` (True) and one that is not ("Saving").
billing = Category("Billing", True)
dept = Category("Dept", True)
purchase = Category("Purchase", True)
entertainment = Category("Entertainment", True)
saving = Category("Saving", False)

# Order matters: actions and simulations address categories by position.
categories = [billing, dept, purchase, entertainment, saving]

# Fixed action for the dry run: five category ratios, goal ratio, investment ratio.
action = Action([0.1, 0.1, 0.1, 0.1, 0.2], 0.1, 0.3)

# Environment starting with 200 of money.
env = Env(200, categories, goal, investment)

In [11]:
# Dry run: apply the same fixed action before every recorded simulation step.
# Print env.observation() / env.reward() inside the loop to inspect the state.
for i in range(len(Simulations)):
    env.update(action)
    env.simulate(Simulations[i])

# PPO

In [12]:
class ActorCriticNetwork(nn.Module):
    """Actor-critic network with a shared trunk and three heads.

    The trunk feeds a mean head and a standard-deviation head, both of
    width `action_space_size`, and a scalar value head.
    """

    def __init__(self, obs_space_size, action_space_size):
        super().__init__()

        # Trunk shared by the policy and the value heads.
        self.shared_layers = nn.Sequential(
            nn.Linear(obs_space_size, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
        )

        # Head producing the mean of the action distribution.
        self.policy_mean = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, action_space_size),
        )

        # Head whose output is exponentiated to obtain the standard deviation.
        self.policy_std = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, action_space_size),
        )

        # Head producing a single value estimate.
        self.value_layers = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def value(self, obs):
        """Return the value estimate for `obs`."""
        return self.value_layers(self.shared_layers(obs))

    def policy(self, obs):
        """Return `(mean, std)` of the action distribution for `obs`."""
        features = self.shared_layers(obs)
        mean = self.policy_mean(features)
        # exp() keeps the standard deviation strictly positive.
        std = self.policy_std(features).exp()
        return mean, std

    def forward(self, obs):
        """Return `(mean, std, value)` from a single pass through the trunk."""
        features = self.shared_layers(obs)
        mean = self.policy_mean(features)
        std = self.policy_std(features).exp()
        return mean, std, self.value_layers(features)

In [13]:
class PolicyNetwork:
    """Samples actions from the Gaussian policy of an actor-critic network."""

    def sample(self, ac_network, obs_tensor):
        """Draw an action for `obs_tensor` and return it with its log-probability.

        `ac_network.policy(obs_tensor)` must return `(mean, std)`. The
        log-probability is element-wise, so it has the same shape as the
        action.
        """
        loc, scale = ac_network.policy(obs_tensor)
        sampled = torch.normal(loc, scale)
        gaussian = torch.distributions.Normal(loc, scale)
        return sampled, gaussian.log_prob(sampled)

In [14]:
class PPOTrainer():
    """Runs clipped-surrogate PPO updates on an actor-critic network.

    The actor-critic object must expose `shared_layers`, `policy_mean`,
    `policy_std` and `value_layers` sub-modules plus `policy(obs)` and
    `value(obs)` methods.

    Args:
        actor_critic: Network to train.
        ppo_clip_val: Clipping range for the probability ratio.
        target_kl_div: Policy training stops early once the estimated KL
            divergence from the rollout policy reaches this value.
        max_policy_train_iters: Maximum policy gradient steps per call.
        value_train_iters: Value gradient steps per call.
        policy_lr: Adam learning rate for trunk + policy heads.
        value_lr: Adam learning rate for trunk + value head.
    """

    def __init__(self,
                actor_critic,
                ppo_clip_val=0.2,
                target_kl_div=0.02,
                max_policy_train_iters=4,
                value_train_iters=4,
                policy_lr=1e-5,
                value_lr=1e-5):
        self.ac = actor_critic
        self.ppo_clip_val = ppo_clip_val
        self.target_kl_div = target_kl_div
        self.max_policy_train_iters = max_policy_train_iters
        self.value_train_iters = value_train_iters

        # The shared trunk is optimised by both optimisers.
        policy_params = list(self.ac.shared_layers.parameters()) + \
            list(self.ac.policy_mean.parameters()) + list(self.ac.policy_std.parameters())
        self.policy_optim = optim.Adam(policy_params, lr=policy_lr)

        value_params = list(self.ac.shared_layers.parameters()) + \
            list(self.ac.value_layers.parameters())
        self.value_optim = optim.Adam(value_params, lr=value_lr)

    @staticmethod
    def _joint_log_prob(log_probs):
        """Collapse per-dimension log-probs into one log-prob per sample."""
        if log_probs.dim() > 1:
            return log_probs.flatten(start_dim=1).sum(dim=1)
        return log_probs

    def train_policy(self, obs, acts, old_log_probs, gaes):
        """Update the policy with the PPO clipped objective.

        Args:
            obs: Batch of observations, first dimension is the batch.
            acts: Actions taken, one row per observation.
            old_log_probs: Log-probabilities of `acts` under the policy that
                collected them; either per action dimension or already
                summed per sample.
            gaes: One advantage estimate per sample.
        """
        # One log-prob and one advantage per sample, so the products below
        # are element-wise instead of broadcasting across samples.
        old_log_probs = self._joint_log_prob(old_log_probs).detach()
        gaes = gaes.reshape(-1)

        for _ in range(self.max_policy_train_iters):
            self.policy_optim.zero_grad()

            # 1. Log-probability of the taken actions under the current policy
            new_mean, new_std = self.ac.policy(obs)
            new_dist = dist.Normal(new_mean, new_std)
            new_log_probs = self._joint_log_prob(new_dist.log_prob(acts))

            # 2. Probability ratio and its clipped version
            policy_ratio = torch.exp(new_log_probs - old_log_probs)
            clipped_ratio = policy_ratio.clamp(1 - self.ppo_clip_val, 1 + self.ppo_clip_val)

            # 3. Pessimistic (minimum) surrogate objective, negated into a loss
            surrogate = torch.min(clipped_ratio * gaes, policy_ratio * gaes)
            policy_loss = -surrogate.mean()

            # 4. Backpropagate and update the policy network
            policy_loss.backward()
            self.policy_optim.step()

            # 5. Sample-based KL estimate between the rollout policy and the
            #    policy evaluated in this iteration; stop once it drifts too far.
            kl_div = (old_log_probs - new_log_probs).mean().detach()
            if kl_div >= self.target_kl_div:
                break

    def train_value(self, obs, returns):
        """Fit the value head to `returns` with a mean-squared-error loss.

        Args:
            obs: Batch of observations.
            returns: One target return per observation (tensor, array or list).
        """
        # Flatten targets and predictions so the difference is per sample
        # rather than an (N, N) broadcast of (N,) against (N, 1).
        targets = torch.as_tensor(returns, dtype=torch.float).detach().reshape(-1)

        for _ in range(self.value_train_iters):
            self.value_optim.zero_grad()

            # 1. Predicted values, one per observation
            values = self.ac.value(obs).reshape(-1)

            # 2. Mean squared error against the target returns
            value_loss = ((targets - values) ** 2).mean()

            # 3. Backpropagate and update the value network
            value_loss.backward()
            self.value_optim.step()

In [15]:
def discount_rewards(rewards, gamma=0.99):
    """
    Return the discounted reward-to-go for every step.

    Element i of the result is rewards[i] + gamma * rewards[i+1] +
    gamma**2 * rewards[i+2] + ... up to the end of the sequence.

    Args:
        rewards: Sequence of per-step rewards (anything `float()` accepts).
        gamma: Discount factor.

    Returns:
        A float NumPy array with one entry per reward; empty when
        `rewards` is empty (previously this raised IndexError).
    """
    count = len(rewards)
    discounted = np.zeros(count)
    running = 0.0
    # Walk backwards so each step reuses the return of the step after it.
    for i in reversed(range(count)):
        running = float(rewards[i]) + gamma * running
        discounted[i] = running
    return discounted

def calculate_gaes(rewards, values, gamma=0.99, decay=0.97):
    """
    Return the Generalized Advantage Estimates for the given rewards and values.
    Paper: https://arxiv.org/pdf/1506.02438.pdf

    Args:
        rewards: Per-step rewards.
        values: Per-step value estimates, same length as `rewards`.
        gamma: Discount factor.
        decay: GAE smoothing factor (lambda in the paper).

    Returns:
        A NumPy array with one advantage per step; empty when the inputs
        are empty (previously this raised IndexError).
    """
    # The value after the final step is taken to be 0.
    next_values = np.concatenate([values[1:], [0]])
    deltas = [rew + gamma * next_val - val for rew, val, next_val in zip(rewards, values, next_values)]
    if not deltas:
        return np.array([])

    # Accumulate backwards: gae[i] = delta[i] + decay * gamma * gae[i + 1].
    gaes = [deltas[-1]]
    for delta in reversed(deltas[:-1]):
        gaes.append(delta + decay * gamma * gaes[-1])

    return np.array(gaes[::-1])

# Rollout

In [16]:
def collect_trajectories(env, ac_network, p_network, num_trajectories, simulations):
    """Roll the policy through `simulations` and record every step.

    For each of the `num_trajectories` passes, one action is sampled per
    simulation step, applied with `env.update`, and followed by
    `env.simulate`. The environment is not reset between passes; each pass
    starts from the state the previous one left behind.

    Args:
        env: Environment exposing `observation`, `update`, `simulate`, `reward`.
        ac_network: Actor-critic network providing `value(obs)`.
        p_network: Sampler providing `sample(ac_network, obs)`.
        num_trajectories: Number of passes over `simulations`.
        simulations: Iterable of simulation steps.

    Returns:
        Tuple of NumPy arrays `(observations, actions, rewards, log_probs,
        values)` with one row per recorded step, concatenated over passes.
    """
    all_observations = []
    all_actions = []
    all_rewards = []
    all_log_probs = []
    all_values = []

    for _ in range(num_trajectories):
        obs = env.observation()

        for simulation in simulations:
            # Sampling and value estimation need no gradients during rollout.
            with torch.no_grad():
                obs_tensor = torch.FloatTensor(obs)
                action, log_prob = p_network.sample(ac_network, obs_tensor)
                value = ac_network.value(obs_tensor)

            env.update(get_action(action))
            env.simulate(simulation)

            next_obs = env.observation()
            reward = env.reward()

            # Store plain NumPy/float data so np.array() below builds regular
            # numeric arrays instead of converting tensors element by element.
            all_observations.append(obs)
            all_actions.append(action.numpy())
            all_rewards.append(float(reward))
            all_log_probs.append(log_prob.numpy())
            all_values.append(value.numpy())

            obs = next_obs

    return (
        np.array(all_observations),
        np.array(all_actions),
        np.array(all_rewards),
        np.array(all_log_probs),
        np.array(all_values)
    )

# Training

In [95]:
# Load the recorded steps used for training and split them into chunks of 50;
# each chunk is replayed as one rollout in the training loop.
simulations = load_simulations("../data/record.json")
batches = get_batches(simulations, 50)

In [96]:
# Observation length: money + 5 actual ratios + 5 satisfactions
# + goal achievement + investment profit (see Env.observation).
obs_space_size = 13 
# Action length: 5 category ratios + goal ratio + investment ratio (see get_action).
action_space_size = 7  

ac_network = ActorCriticNetwork(obs_space_size, action_space_size)
p_network = PolicyNetwork()
# NOTE(review): learning rates of 1e-10 make each update step extremely small;
# confirm this is intentional rather than a workaround for unstable training.
ppo_trainer = PPOTrainer(ac_network, policy_lr=1e-10, value_lr=1e-10)

num_epochs = 10 

# Number of passes over each batch when collecting rollouts.
num_trajectories = 2
# Mean reward of every processed batch, in processing order.
reward_rec = []

In [None]:
# Main training loop: for every epoch, replay each batch of simulations,
# collect rollouts with the current policy and run one PPO update on them.
for epoch in range(1, num_epochs+1):
    print(f"[EPOCH] {epoch}")
    for i, batch in enumerate(batches):
        # Rollout data comes back as NumPy arrays, one row per step.
        obs, acts, rewards, log_probs, values = collect_trajectories(env, ac_network, p_network, num_trajectories, batch)
        returns = discount_rewards(rewards)
        # Flatten the value estimates to one number per step for GAE.
        values = values.reshape(-1)
        advantages = calculate_gaes(rewards, values)
        # `returns` is passed in the `rewards` slot of to_tensor; `rewards` itself stays a NumPy array.
        obs, acts, returns, log_probs, values, advantages = to_tensor(obs, acts, returns, log_probs, values, advantages)
        
        ppo_trainer.train_policy(obs, acts, log_probs, advantages)
        ppo_trainer.train_value(obs, returns)
        
        reward_rec.append(rewards.mean())
        if i % 10 == 0:
            print(f"[BATCH] {i} | Reward: {rewards.mean():.2f}")
        
    # Every fifth epoch, summarise the statistics of the last processed batch.
    if epoch % 5 == 0:
        print(f"Epoch: {epoch} | Rewards: {rewards.mean():.2f} | Returns: {returns.mean():.2f} | Advantages: {advantages.mean():.2f}")

# Model Saving

In [101]:
# Persist only the network weights (state_dict), not the whole module object.
torch.save(ac_network.state_dict(), "ppo_actor_critic.pth")