# Reinforcement Learning - Syft Duet - Data Scientist 🥁

Contributed by [@Koukyosyumei](https://github.com/Koukyosyumei)

## PART 1: Connect to a Remote Duet Server

As the Data Scientist, you want to perform data science on data that is sitting in the Data Owner's Duet server in their Notebook.

To do this, we must run the code that the Data Owner sends us, which includes their Duet Server ID. The code will look like the snippet below, but with their real Server ID in place of the placeholder.

```
import syft as sy
duet = sy.duet('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
```

This will create a direct connection from your notebook to the remote Duet server. Once the connection is established, all traffic is sent directly between the two nodes.

Paste the code or Server ID that the Data Owner gives you and run it in the cell below. It will return your Client ID which you must send to the Data Owner to enter into Duet so it can pair your notebooks.

In [None]:
from itertools import count

import numpy as np
import syft as sy
import torch
# Join the Duet session; `duet` is the handle every later cell uses to reach
# remote objects (duet.torch, duet.gym, duet.store).
duet = sy.join_duet(loopback=True)

# Add a file sink for syft's logger, relative to the current working directory.
log_sink = "./syft_ds.log"
sy.logger.add(sink=log_sink)

In [None]:
# Load syft's support for the extra libraries used below
# (duet.gym for the environment, numpy for the states it returns).
for lib_name in ("gym", "numpy"):
    sy.load(lib_name)

In [None]:
# Hyper-parameters and run-time switches shared by the cells below.
config = {
    "gamma": 0.99,  # discount factor applied to future rewards in finish_episode
    "seed": 543,  # seed passed to remote_torch.manual_seed
    "render": False,  # not read by any cell in this notebook
    # Print progress every N episodes. This key used to be listed twice
    # (10, then 1); Python keeps the last duplicate, so 1 was always the
    # value in effect. The dead first entry has been removed.
    "log_interval": 1,
    "no_cuda": False,  # when True, the torch device is forced to "cpu"
    "wait_interval": 1,  # not read by any cell in this notebook
}

# Handle to the torch module exposed through the Duet connection.
remote_torch = duet.torch

# Seed the remote torch RNG with the configured seed.
rng_seed = config["seed"]
remote_torch.manual_seed(rng_seed)

In [None]:
# Assume there is no GPU until the Data Owner tells us otherwise.
has_cuda = False

# Ask the Data Owner whether CUDA is available. The call returns a pointer,
# so the actual boolean has to be requested with .get().
has_cuda_ptr = remote_torch.cuda.is_available()
cuda_answer = has_cuda_ptr.get(
    request_block=True,
    reason="To run test and inference locally",
    timeout_secs=3,  # increase this if the Data Owner needs longer to respond
)
has_cuda = bool(cuda_answer)
print("Is cuda available ? : ", has_cuda)

# CUDA is used only when it is available remotely and not disabled in config.
use_cuda = has_cuda and not config["no_cuda"]

# Seed the remote RNG (again) now that the device choice is known.
remote_torch.manual_seed(config["seed"])

device_name = "cuda" if use_cuda else "cpu"
device = remote_torch.device(device_name)
# To inspect the chosen device:
# print(f"Data Owner device is {device.type.get()}")

In [None]:
class Policy(sy.Module):
    """Small policy network: Linear(4, 128) -> Dropout -> ReLU -> Linear(128, 2).

    Every layer and op is created through ``self.torch_ref`` so the same class
    works with whichever torch handle it was constructed with (the local
    ``torch`` module or a remote one).

    Attributes:
        saved_log_probs: list that callers append per-step log-probabilities to.
        rewards: list that callers append per-step rewards to.
    """

    def __init__(self, torch_ref):
        """Build the layers using ``torch_ref`` as the torch implementation."""
        super().__init__(torch_ref=torch_ref)
        self.affine1 = self.torch_ref.nn.Linear(4, 128)
        self.dropout = self.torch_ref.nn.Dropout(p=0.6)
        self.relu = self.torch_ref.nn.ReLU(True)  # in-place ReLU
        self.affine2 = self.torch_ref.nn.Linear(128, 2)
        # Per-episode buffers, filled while acting and emptied after each update.
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        """Return the softmax over the two action scores for input ``x``.

        The softmax is taken along dim=1, so ``x`` is expected to carry a
        leading batch dimension.
        """
        x = self.affine1(x)
        x = self.dropout(x)
        x = self.relu(x)
        action_scores = self.affine2(x)
        # Use this module's own torch handle rather than a notebook global, so
        # forward() stays consistent with the layers created in __init__.
        return self.torch_ref.softmax(action_scores, dim=1)

In [None]:
# Build the model with the local torch, then send it over the Duet connection.
policy = Policy(torch_ref=torch)
remote_policy = policy.send(duet)

# The optimizer is created through remote_torch over the sent model's parameters.
learning_rate = 1e-2
optimizer = remote_torch.optim.Adam(remote_policy.parameters(), lr=learning_rate)

# float32 machine epsilon as a plain Python float; finish_episode adds it to
# the standard deviation before dividing.
eps = float(np.finfo(np.float32).eps)

In [None]:
# Place the remote model on the device chosen earlier. Branch on `use_cuda`
# rather than `has_cuda`: `device` was built from `use_cuda`, so this keeps
# config["no_cuda"] honoured and guarantees .cuda() only ever receives a
# CUDA device.
if use_cuda:
    remote_policy.cuda(device)
else:
    remote_policy.cpu()

In [None]:
# The Data Scientist cannot see the state; it is only handled through remote calls.
def select_action(state):
    """Sample an action for ``state`` from the remote policy.

    The state is converted to a float tensor with a leading batch dimension,
    passed through ``remote_policy``, and an action is drawn from the
    resulting categorical distribution. The log-probability of the drawn
    action is appended to ``remote_policy.saved_log_probs`` for the later
    policy update.

    Returns:
        The result of calling ``.item()`` on the sampled action.
    """
    state_batch = remote_torch.from_numpy(state).float().unsqueeze(0)
    action_probs_ptr = remote_policy(state_batch)
    distribution = remote_torch.distributions.Categorical(action_probs_ptr)
    sampled_action = distribution.sample()
    action_log_prob = distribution.log_prob(sampled_action)
    remote_policy.saved_log_probs.append(action_log_prob)
    return sampled_action.item()


def finish_episode():
    """Run one policy-gradient update from the episode just played.

    Reads the rewards and log-probabilities buffered on ``remote_policy``,
    turns the rewards into normalised discounted returns, sums
    ``-log_prob * return`` over the steps, performs one optimizer step, and
    finally empties both buffers so the next episode starts clean.
    """
    gamma = config["gamma"]

    # Discounted return for every step. Built back-to-front with append() and
    # reversed once at the end, which is linear instead of the quadratic cost
    # of inserting at index 0 on every step.
    discounted = 0
    returns = []
    for reward in reversed(remote_policy.rewards):
        discounted = reward + gamma * discounted
        returns.append(discounted)
    returns.reverse()

    # Normalise the returns; eps keeps the division finite when std is zero.
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_loss = 0
    for log_prob, step_return in zip(remote_policy.saved_log_probs, returns):
        step_loss = -log_prob * step_return
        # NOTE(review): each per-step loss is fetched with .get() before being
        # summed, and backward() below runs on that fetched sum while the
        # optimizer was created through remote_torch. Confirm that gradients
        # actually reach the remote parameters.
        policy_loss += step_loss.get()

    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()

    # Empty the per-episode buffers in place.
    remote_policy.rewards.clear()
    remote_policy.saved_log_probs.clear()

In [None]:
# Pointer to the "reward_threshold" entry in the Duet store.
reward_threshold_ptr = duet.store["reward_threshold"]

# Request the value itself. delete_obj=False presumably leaves the stored
# object in place after the fetch (confirm against the syft API).
reward_threshold = reward_threshold_ptr.get(
    request_block=True,
    delete_obj=False,
)
print(f"reward_threshold is {reward_threshold}")

In [None]:
# Create the CartPole environment through the Duet gym handle and seed it.
env_name = "CartPole-v0"
remote_env = duet.gym.make(env_name)

env_seed = 42
remote_env.seed(env_seed)

In [None]:
# Exponential moving average of the episode rewards, started at 10.
running_reward = 10

# Run episodes until the "solved" check at the bottom breaks out of the loop.
for i_episode in count(1):

    # Reset the environment and the per-episode reward total.
    state = remote_env.reset()
    ep_reward = 0

    # Cap every episode at 9999 steps so a policy that never fails cannot
    # keep a single episode running forever.
    for t in range(1, 10000):
        # Select an action from the policy.
        action = select_action(state)

        # Take the action; reward and done come back as remote handles, so
        # their values are fetched with .get().
        state, reward, done, _ = remote_env.step(action)
        reward = reward.get()

        remote_policy.rewards.append(reward)
        ep_reward += reward

        if done.get():
            break

    # Update the moving average (5% weight on the newest episode).
    running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

    # Perform the policy update for this episode.
    finish_episode()

    # Log results.
    if i_episode % config["log_interval"] == 0:
        print(
            f"Episode {i_episode}\tLast reward: {ep_reward:.2f}"
            f"\tAverage reward: {running_reward:.2f}"
        )

    # Check whether the cart pole problem counts as "solved". Compare against
    # the local `reward_threshold` value fetched earlier, not against an
    # attribute of the remote environment handle, so the comparison is
    # between two plain local numbers.
    if running_reward > reward_threshold:
        print(
            f"Solved! Running reward is now {running_reward} and "
            f"the last episode runs to {t} time steps!"
        )
        break
