<a href="https://colab.research.google.com/github/Mufabo/py_inforce/blob/master/tests/Examples_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [1]:
!git clone https://github.com/Mufabo/py_inforce.git
%cd py_inforce
!pip install -e .

Cloning into 'py_inforce'...
remote: Enumerating objects: 128, done.
remote: Counting objects: 100% (128/128), done.
remote: Compressing objects: 100% (87/87), done.
remote: Total 128 (delta 47), reused 108 (delta 33), pack-reused 0
Receiving objects: 100% (128/128), 644.77 KiB | 872.00 KiB/s, done.
Resolving deltas: 100% (47/47), done.
/content/py_inforce
Obtaining file:///content/py_inforce
Installing collected packages: py-inforce
  Running setup.py develop for py-inforce
Successfully installed py-inforce


# Dynamic Programming

## Policy Iteration on FrozenLake

In [0]:
import py_inforce as pin
import gym
import numpy as np

# Number of evaluation episodes used to estimate the average return.
N_EPISODES = 100

env = gym.make('FrozenLake-v0', is_slippery=True)

# NOTE(review): .95 is presumably the discount factor and `thresh` the
# convergence tolerance -- confirm against pin.policy_iteration.
policy = pin.policy_iteration(env, .95, thresh=0.00001)

returns = []

for _ in range(N_EPISODES):
    state = env.reset()

    done = False
    ret = 0

    while not done:
        # Pick the action whose entry in this state's policy row is largest.
        # argmax is used instead of an exact `== 1` match so that rows which
        # are not exactly one-hot (e.g. float round-off) still yield an action.
        action = int(np.argmax(policy[state]))
        state, reward, done, _ = env.step(action)
        ret += reward
    returns.append(ret)

# Average return over the evaluation episodes (displayed as the cell output).
sum(returns) / N_EPISODES

0.78

## Value Iteration on FrozenLake

In [0]:
import gym
import numpy as np
import py_inforce as pin

# Number of evaluation episodes used to estimate the average return.
N_EPISODES = 100

env = gym.make('FrozenLake-v0', is_slippery=True)

# value_iteration returns a pair; only the first element (the policy) is used here.
policy, _ = pin.value_iteration(env)

returns = []

for _ in range(N_EPISODES):
    state = env.reset()

    done = False
    ret = 0

    while not done:
        # Pick the action whose entry in this state's policy row is largest.
        # argmax is used instead of an exact `== 1` match so that rows which
        # are not exactly one-hot (e.g. float round-off) still yield an action.
        action = int(np.argmax(policy[state]))
        state, reward, done, _ = env.step(action)
        ret += reward
    returns.append(ret)

# Average return over the evaluation episodes (displayed as the cell output).
sum(returns) / N_EPISODES

0.84

# Policy Gradient Methods

## REINFORCE on CartPole

In [0]:
from torch.distributions import Categorical
import gym
import torch.nn as nn
from py_inforce.generic.mlp import MLP
from py_inforce.policy_based.REINFORCE import REINFORCE
import torch.optim as optim
import torch
import numpy as np
import py_inforce as pin

env = gym.make('CartPole-v0')
in_dim = env.observation_space.shape[0] # 4
out_dim = env.action_space.n # 2
cart_agent = MLP([in_dim, 128, 128, out_dim], nn.ReLU)
optimizer = optim.Adam(cart_agent.parameters(), lr=cart_agent.lr)

# Train the agent. The EARLY predicate stops training as soon as the agent
# achieves a score of 200 just once; `bf` subtracts the mean from its input.
pin.REINFORCE(cart_agent, env, Categorical, optimizer, 200, bf = lambda x: x - x.mean(), MAX_EPISODES=500, EARLY = lambda x: x == 200)

# Roll out a single episode with the trained agent and accumulate its reward.
done = False

state = env.reset()
rewards = 0

# No gradients are needed while only evaluating the policy.
with torch.no_grad():
    while not done:
        state = torch.from_numpy(state.astype(np.float32))
        # Call the module itself rather than .forward() so registered hooks run.
        action_dist = Categorical(logits=cart_agent(state))
        # Actions are sampled from the policy distribution, not chosen greedily.
        action = action_dist.sample()
        # .item() hands the environment a plain Python int.
        state, reward, done, _ = env.step(action.item())
        rewards += reward
        # env.render()  # uncomment to watch the episode

rewards

10.0

# Temporal Difference Learning

## SARSA

*TODO: no SARSA example has been added yet.*

# Value Based Methods

## Deep Q Learning

In [2]:
import gym
import torch.nn as nn
from py_inforce.generic.mlp import MLP
from py_inforce.value_based.DQN import DQN
from py_inforce.generic.Memories import ReplayMemory
import torch.optim as optim
import torch
import numpy as np
import math

env = gym.make('CartPole-v0')
in_dim = env.observation_space.shape[0] # 4
out_dim = env.action_space.n # 2
# Only a single network is built here: the DQN call below takes no second
# (target) network argument.
q_net = MLP([in_dim, 128, 128, out_dim], nn.ReLU, LEARN_RATE = 0.005)
# The optimizer class itself is passed, not an instance
# (presumably DQN instantiates it -- confirm against py_inforce).
optimizer = optim.Adam
memory = ReplayMemory(1000, in_dim, out_dim)

DQN(env, memory, q_net, optimizer, steps = 10000, eps = 1, disc_factor = 0.99, loss = torch.nn.MSELoss(), batch_sz = 32)
# Note: the recorded run converged unusually quickly (a lucky run); no random
# seed is set in this cell, so the number of steps will differ between runs.

converged in 42 steps


## Double Deep Q Learning

In [3]:
import gym
import torch.nn as nn
from py_inforce.generic.mlp import MLP
from py_inforce.value_based.Double_DQN import Double_DQN
from py_inforce.generic.Memories import ReplayMemory
import torch.optim as optim
import torch
import numpy as np
import math

# Environment and the input/output sizes derived from it.
env = gym.make('CartPole-v0')
in_dim = env.observation_space.shape[0]  # 4
out_dim = env.action_space.n  # 2

# Both networks share the same layer layout: two hidden layers of equal width.
hidden = 128
q_net = MLP([in_dim, hidden, hidden, out_dim], nn.ReLU, LEARN_RATE=0.005)
t_net = MLP([in_dim, hidden, hidden, out_dim], nn.ReLU)

# The optimizer is handed over as a class, not as an instance.
optimizer = optim.Adam
memory = ReplayMemory(1000, in_dim, out_dim)

# Training hyperparameters, gathered in one place and forwarded as keywords.
train_kwargs = dict(
    steps=10000,
    eps=1,
    disc_factor=0.99,
    loss=torch.nn.MSELoss(),
    batch_sz=32,
    tgt_update=100,
)
Double_DQN(env, memory, q_net, t_net, optimizer, **train_kwargs)


converged in 1459 steps
