In [2]:
import numpy as np
import gym
from gym import spaces
import pygame
import torch 

# Import packages
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from typing import List, Tuple
from matplotlib import animation

import collections
from collections import namedtuple, deque
import tqdm
import matplotlib.pyplot as plt
import random
import gymnasium as gym

from IPython.display import clear_output
from IPython import display
from gym.wrappers import FlattenObservation

torch.set_default_dtype(torch.float64)

In [580]:
class balloon3D:
    def __init__(self,width,length,height,obs_size):
        self.obs_size = obs_size
        self.x, self.y , self.z = 0,0,0
        self.height = height
        self.width = width
        self.length = length
        self.map = None

    def reset(self,start_x,start_y,start_z):
        self.x, self.y ,self.z = start_x,start_y,start_z

    def up(self):
        self.z = min(self.height-1,self.z+1)

    def down(self):
        self.z = max(self.z-1,0)

    def step(self):
        newx = int((self.x + self.map[self.x,self.y,self.z,0]) % self.width)
        newy = int((self.y + self.map[self.x,self.y,self.z,1]) % self.length)
        self.x = newx
        self.y = newy

    def generate_map(self,eta,mu):
        self.map = np.zeros((self.width,self.length,self.height,2))
        for i in range(self.width):
            for j in  range(self.length):
                for k in range(self.height):
                    for l in range(2):
                        r = np.random.random()
                        if(r>eta):
                            r= np.random.random()
                            if(r>mu):
                                self.map[i,j,k,l] = 1
                            else:
                                self.map[i,j,k,l] = -1


    def set_map(self,map):
        self.map = map

    def get_winds(self):
        r = self.obs_size
        self.obs = np.zeros((2*r+1,2*r+1,self.height,2))
        for i in range(self.x - r , self.x + r + 1):
            for j in range(self.y - r, self.y + r + 1):
                for k in range(0,self.height):
                    for l in range(2):
                        self.obs[i - self.x + r, j- self.y + r,k,l]  = (self.map[i%self.width,j%self.length,k,l])
        return self.obs
    
    def render_obs(self):
        r =self.obs_size*2+1
        for z in range(self.height):
            for j in range(r):
                s = ''
                for i in range(r):
                    l = ''
                    if(self.obs[i,j,z,0] == 0):
                        l += '.'
                    elif(self.obs[i,j,z,0] == 1):
                        l+='>'
                    elif(self.obs[i,j,z,0] == -1):
                        l+='<'

                    if(self.obs[i,j,z,1] == 0):
                        l += '.'
                    elif(self.obs[i,j,z,1] == 1):
                        l+='v'
                    elif(self.obs[i,j,z,1] == -1):
                        l+='^'                  
                    s+= l+ " "
                print(s)
            print(" ")
    

    
    def render(self):
        for z in range(self.height):
            for j in range(self.length):
                s = ''
                for i in range(self.width):
                    if self.x == i and self.y == j and self.z == z : 
                        s+= 'OO '
                    else:
                        l = ''
                        if(self.map[i,j,z,0] == 0):
                            l += '.'
                        elif(self.map[i,j,z,0] == 1):
                            l+='>'
                        elif(self.map[i,j,z,0] == -1):
                            l+='<'

                        if(self.map[i,j,z,1] == 0):
                            l += '.'
                        elif(self.map[i,j,z,1] == 1):
                            l+='v'
                        elif(self.map[i,j,z,1] == -1):
                            l+='^'
                        
                        s+= l+ " "
                print(s)
            print(" ")
                
        

In [156]:
bal = balloon3D(5,5,3,2)
bal.generate_map(0.5,0.5)
bal.render()

OO <^ >v >. .. 
<v <^ .v .. <. 
.. >. >^ .^ .^ 
.v .^ .. .. <v 
<v >v .. .^ .. 
 
.^ .. .. .. .^ 
.. .v >. >^ <v 
<^ <v >. .. .v 
<^ <. >v .. <. 
>v .. .v .^ >. 
 
>. >^ <. >v <. 
.v .. >^ >. .. 
.^ .v >. .^ .v 
<. .. >. >. >. 
.. .^ .. >. .. 
 


In [157]:
bal.step()
bal.render()

>^ <^ >v >. .. 
<v <^ .v .. <. 
.. >. >^ .^ .^ 
.v .^ .. .. <v 
<v OO .. .^ .. 
 
.^ .. .. .. .^ 
.. .v >. >^ <v 
<^ <v >. .. .v 
<^ <. >v .. <. 
>v .. .v .^ >. 
 
>. >^ <. >v <. 
.v .. >^ >. .. 
.^ .v >. .^ .v 
<. .. >. >. >. 
.. .^ .. >. .. 
 


In [159]:
bal.get_winds()
bal.redner_obs()

.^ .. . ^ .^ 
>v .v .^ .. .. 
.. >v v .. .^ 
.. ^ <^ v >. 
>. >v >^ .v .. 
 
.v <^ <v >. .. 
>. <^ >. >v .. 
. v .. .v .^ 
.^ .^ .. .. .. 
<v .. .v . ^ 
 
.v .^ .v >. .^ 
. >. .. . . 
.. .. .^ .. . 
<. . ^ <. v 
.. .v .. >^ . 
 


In [964]:
class BalEnv(gym.Env):
    def __init__(self, render_mode=None, s_x = 10 , s_y =10 , s_z = 3, obs_size = 5 ):
        self.balloon = balloon3D(s_x,s_y,s_z,obs_size)
        self.obs_size = obs_size

        self._target_location = [s_x-1,s_y-1,s_z-1]


        # Observations are dictionaries with the agent's and the target's location.
        # Each location is encoded as an element of {0, ..., `size`}^2, i.e. MultiDiscrete([size, size]).
        self.observation_space = spaces.Dict(
            {
                #"agent": spaces.Box(0, size - 1, shape=(2,), dtype=int),
                #"target": spaces.Box(0, size - 1, shape=(2,), dtype=int),
                "map": spaces.Box(-1, 1, shape=(2*obs_size + 1,obs_size*2+1,s_z,2), dtype=int),
            }
        )
        

        self.action_space = spaces.Discrete(3)

    def reset(self):
        self.evo_time = 100

        self.reward = 0

        self.reward_x, self.reward_y = 0,0

        self.balloon.generate_map(0.35,0.5)

        self.time = 0

        self.balloon.reset(0,0,0)

        self._agent_location = [0,0,0]


        observation = self._get_obs()

        return observation, {}

    def step(self, action):
        self.time += 1

        if action == 0:
            self.balloon.map[self.balloon.x , self.balloon.y,self.balloon.z,0] +  self.balloon.map[self.balloon.x , self.balloon.y,self.balloon.z,1] 
            self.balloon.step()
        if action == 1:
            self.balloon.up()
        if action == 2:
            self.balloon.down()


        
        self._agent_location = np.array([self.balloon.x , self.balloon.y,self.balloon.z])
        terminated  = ((self._agent_location[0]==self._target_location[0])and(self._agent_location[1]==self._target_location[1]))
        
        distx = min(((self._agent_location[0] - self._target_location[0])%self.balloon.width),(self._target_location[0] - self._agent_location[0])%self.balloon.width)
        disty = min(((self._agent_location[1] - self._target_location[1])%self.balloon.length),(self._target_location[1] - self._agent_location[1])%self.balloon.length)



        self.reward = -(np.sqrt((distx)**2 + (disty)**2))

        observation = self._get_obs()

        return observation, self.reward, terminated, False, {}
    
    def _get_obs(self):
        return {"map" : self.balloon.get_winds()}

    def render(self):
        self.balloon.render()
        
    def close(self):
        pass

In [860]:
env = BalEnv()
env.reset()


({'map': array([[[[-1.,  0.],
           [ 1.,  1.],
           [-1.,  1.]],
  
          [[-1.,  0.],
           [ 1., -1.],
           [-1.,  1.]],
  
          [[-1.,  1.],
           [-1.,  1.],
           [-1., -1.]],
  
          [[-1.,  0.],
           [ 1.,  1.],
           [ 0.,  1.]],
  
          [[-1.,  0.],
           [ 1.,  1.],
           [-1.,  1.]],
  
          [[-1., -1.],
           [ 0., -1.],
           [ 1., -1.]],
  
          [[-1.,  1.],
           [ 1.,  0.],
           [-1., -1.]],
  
          [[ 1.,  1.],
           [-1., -1.],
           [-1.,  0.]],
  
          [[ 0., -1.],
           [-1.,  0.],
           [ 1.,  0.]],
  
          [[ 0.,  1.],
           [ 0., -1.],
           [-1.,  1.]],
  
          [[-1.,  0.],
           [ 1.,  1.],
           [-1.,  1.]]],
  
  
         [[[-1.,  1.],
           [-1.,  1.],
           [ 0.,  1.]],
  
          [[ 1., -1.],
           [ 1.,  1.],
           [-1.,  0.]],
  
          [[ 0.,  1.],
           [-1., 

In [963]:
a = env.action_space.sample()
print(a)
_,r,_,_,_ = env.step(a)
env.render()
print(r)

1
<v <. .^ <^ <v <^ <^ <^ >v <^ 
>^ <. >^ >^ <^ <v <. <^ <^ >^ 
<^ >v >v <^ <. >v >^ .. <. <. 
>v <v <^ >v >v .^ .^ .v .v >^ 
<. <v >. <^ >^ .v >^ >. <^ <v 
<^ <. <^ <^ >^ <. <v >v <^ >v 
>v <^ >^ >v >v <. >^ >v .^ .. 
<v <^ >^ .. <v <v .v <v >^ >v 
>. <^ <v <^ <v <. >^ <. <^ >^ 
.^ .. <^ <v <v <. >^ <v .^ >v 
 
<^ <^ <^ >v .^ OO >^ >v >^ .. 
<^ <v .^ <v <v >. <v >v >v >^ 
.. .. <. >^ .^ <^ >v .^ .v .v 
>^ >. .. .^ <^ <. <^ <^ .v .v 
<v <^ <^ >^ <^ .^ <. >^ <. <v 
<v .^ .^ >^ .^ >v <v <. <^ .^ 
<v <. <^ >^ >v >^ >v <v .^ <^ 
>^ <^ <v <v >^ <v <v >v <^ <v 
>v >v >v .v .. >v <. .^ .^ .^ 
<^ .v >^ <v .^ >v <v <v >. .^ 
 
<^ >^ >^ .^ >^ >^ >. <. >v >v 
<v <. >^ <^ .^ <^ >. <v <^ >^ 
>v >. >v <^ <v <. <^ >^ <^ .^ 
<^ >v >^ <. <^ >. <. >^ >^ <^ 
<v <^ <v <v <^ <v <v >^ <^ <v 
<. >v >v .. >v <v .v <^ <^ .. 
<v >v >^ .. >^ <v <. >. <v <^ 
>v <v >. .. <v <^ .v >^ >^ >v 
<v <^ <^ <^ >v .v >. <v <^ >v 
.. >. <. >v .^ <v >. <^ >^ .v 
 
-4.123105625617661


In [965]:
class FullyConnectedModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(FullyConnectedModel, self).__init__()
        self.soft = torch.nn.Softmax(dim = -1)
        # Define layers with ReLU activation
        self.linear1 = nn.Linear(input_size, 30)
        self.activation1 = nn.Sigmoid()
        self.linear2 = nn.Linear(30, 30)
        self.activation2 = nn.Sigmoid()
        self.linear3 = nn.Linear(30, 30)
        self.activation3 = nn.Sigmoid()

        # Output layer without activation function
        self.output_layer = nn.Linear(30, output_size)

        # Initialization using Xavier uniform (a popular technique for initializing weights in NNs)
        nn.init.xavier_normal_(self.linear1.weight)
        nn.init.xavier_normal_(self.linear2.weight)
        nn.init.xavier_normal_(self.linear3.weight)
        nn.init.xavier_normal_(self.output_layer.weight)

    def forward(self, inputs):
        # Forward pass through the layers
        x = self.activation1(self.linear1(inputs))
        x = self.activation2(self.linear2(x))
        x = self.activation3(self.linear3(x))
        x = self.soft(self.output_layer(x))
        return x

    
class QNetwork:
    def __init__(self, env, lr, logdir=None):
        # Define Q-network with specified architecture
        self.net = FullyConnectedModel(4, 2)
        self.env = env
        self.lr = lr 
        self.logdir = logdir
        self.optimizer = optim.Adam(self.net.parameters(), lr=self.lr)

    def load_model(self, model_file):
        # Load pre-trained model from a file
        return self.net.load_state_dict(torch.load(model_file))

    def load_model_weights(self, weight_file):
        # Load pre-trained model weights from a file
        return self.net.load_state_dict(torch.load(weight_file))
    
class QPolicy:
    def __init__(self, s_size, a_size):
        self.net = FullyConnectedModel(s_size,a_size)

    def act(self, state):
        state = torch.tensor(state)
        probs = self.net(state)
        m = torch.distributions.Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

In [1017]:
def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every,env):
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_training_episodes + 1):
        saved_log_probs = []
        rewards = []
        state, _ = env.reset()
        for t in range(max_t):
            action, log_prob = policy.act(torch.tensor(state,dtype=torch.double))
            saved_log_probs.append(log_prob)
            state, reward, done, truncated , _ = env.step(action)
            rewards.append(reward)
            if done or truncated:
                break
        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))

        returns = deque(maxlen=max_t)
        n_steps = len(rewards)
        for t in range(n_steps)[::-1]:
            disc_return_t = returns[0] if len(returns) > 0 else 0
            returns.appendleft(gamma * disc_return_t + rewards[t])

        eps = np.finfo(np.float32).eps.item()
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + eps)


        policy_loss = 0
        for log_prob, disc_return in zip(saved_log_probs, returns):
            policy_loss += (-log_prob * disc_return)

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
        print(policy.net.linear1.weight.grad)
        if i_episode % print_every == 0:
            print("Episode {}\tAverage Score: {:.2f}".format(i_episode, np.mean(scores_deque)))

    return scores

In [1020]:
# Create policy and place it to the device

from gym.wrappers import FlattenObservation



hyperparameters = {
    "n_training_episodes": 100,
    "n_evaluation_episodes": 10,
    "max_t": 30,
    "gamma": 1.0,
    "lr": 1e-2,
    "grid_size" : 5,
    "obs_r" :2,
    "action_space": 3,
    "print" : 1
}

policy = QPolicy(
    ((hyperparameters["obs_r"]*2+1)**2)*2*3,hyperparameters["action_space"]
)
optimizer = optim.Adam(policy.net.parameters(), lr=hyperparameters["lr"])

In [993]:
env = BalEnv(obs_size=hyperparameters["obs_r"])
wrapped_env = FlattenObservation(env)



Episode 1	Average Score: -54.25
tensor([[-0.0019,  0.0009, -0.0019,  ..., -0.0033,  0.0038, -0.0005],
        [-0.0090,  0.0019, -0.0101,  ..., -0.0123,  0.0132, -0.0011],
        [-0.0086, -0.0034,  0.0019,  ..., -0.0047,  0.0014,  0.0032],
        ...,
        [ 0.0088,  0.0033, -0.0018,  ...,  0.0052, -0.0021, -0.0031],
        [-0.0078, -0.0044,  0.0043,  ..., -0.0026, -0.0015,  0.0040],
        [ 0.0105,  0.0017,  0.0047,  ...,  0.0095, -0.0073, -0.0021]])


  state = torch.tensor(state)


In [1021]:
scores = reinforce(
    policy,
    optimizer,
    hyperparameters["n_training_episodes"],
    hyperparameters["max_t"],
    hyperparameters["gamma"],
    hyperparameters["print"],
    wrapped_env)


  state = torch.tensor(state)


tensor([[ 0.0034,  0.0113, -0.0185,  ...,  0.0154,  0.0165, -0.0091],
        [-0.0001, -0.0045,  0.0072,  ..., -0.0047, -0.0057,  0.0026],
        [-0.0039, -0.0031,  0.0064,  ..., -0.0067, -0.0067,  0.0031],
        ...,
        [-0.0017,  0.0040, -0.0066,  ...,  0.0031,  0.0043, -0.0015],
        [ 0.0008, -0.0025,  0.0041,  ..., -0.0016, -0.0026,  0.0005],
        [ 0.0003,  0.0012, -0.0019,  ...,  0.0016,  0.0017, -0.0010]])
Episode 1	Average Score: -19.84
tensor([[-9.4916e-03, -2.1056e-02,  2.9385e-04,  ..., -1.2353e-02,
          1.1586e-03,  6.2983e-03],
        [ 5.9562e-03,  1.8235e-02,  1.1367e-05,  ...,  1.2769e-02,
         -3.5264e-03, -4.4532e-03],
        [ 8.0368e-04,  1.0590e-02, -1.6922e-03,  ...,  8.9573e-03,
          3.1273e-03, -8.5786e-04],
        ...,
        [-4.8650e-03, -8.5564e-03,  1.4294e-03,  ..., -5.6652e-03,
          8.3277e-03,  6.9602e-03],
        [ 6.0479e-03,  1.4519e-02, -3.2322e-04,  ...,  1.2047e-02,
         -7.1740e-03, -5.3334e-03],
      

  returns = (returns - returns.mean()) / (returns.std() + eps)


ValueError: Expected parameter probs (Tensor of shape (3,)) of distribution Categorical(probs: torch.Size([3])) to satisfy the constraint Simplex(), but found invalid values:
tensor([nan, nan, nan], grad_fn=<DivBackward0>)