## Run 1 

In [6]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.
    Describes relevant properties of the state and action spaces. 
    """
    metadata = {'render.modes':['human']}
    
    
    def __init__(self, s, episodes, total):
        """ 
        Observation:
        Type: Box(5)
                                                                Min         Max
        0	Confirmed Cases                                      0           Inf
        1	Active cases                                         0           Inf
        2	Recovery Rate = 100 - Death Rate=Recovered/Confirmed 0           Inf
        3	Population Density                                   0           Inf
        4	Projected Cases (to be introduced later by ABM)      0           Inf
        
    Actions:
    Type: Box (s+1)
    List of length (s+1)
    
    """
        self.states = s #no of independent simulations to be run 
        low = np.zeros(4)
        high = np.ones(4) * np.inf
        self.observation_space = spaces.Box(low, high, dtype = np.float32)
        #actions are vectors of the form [n1, n2, n3,...nk, r] for k states and r reserved amount of drug 
        self.action_space = spaces.Box(low = np.zeros((s+1, ), dtype = int), high = np.array([100]*(s+1)), shape = (s + 1, ), dtype = np.float)
        
        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total #total number of vials available in 1 batch = batch size 
        self.episodes = episodes
        self.received = [0]*self.states
        self.rr = [0]*self.states
        self.states_cond = []
        self.action_list = []
        self.gamma = 0.80
        self.epsilon = 0.4  

        
    def get_discrete_int(self, n):
        discrete_int = int(n)
        return discrete_int

    def reset(self):
        """
        Resets observation_space to a matrix initialising situation of states wrt the current figures; 
        action_space tp start exploring from the point of equal distribution between all states.
        """
        self.curr_step = 0
        self.done = False
        self.total = 10000
        # Declare the Initial Conditions for the States
       
        self.states_cond =  np.array([(80188,28329, 0., 11297 ),  
                              (30709,6511,0., 308),
                              (16944,3186,0., 201),
                              (12965,2444,0., 236),
                              (159133,67615,0., 365)])
                               # Confirmed   Active  Recovery Rate(due to effect of drug) Population Density
                               # Delhi, Guj, Raja, MP, Maha 
        #store the actions in an array 
        self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        #Declare the Value table 
        self.valueMap = np.zeros((self.states, 100))
        return self.states_cond, self.action_list
        

    def step(self, action):
        """
        Assumptions:
        1. Drug has 50% efficacy 
        2. Vaccine is passive, not antigen based- works to fight off existing infection.
        3. 1 person requires 1 vial (dose) only.
        4. No of confirmed and active cases in one particular region is constant, until we integrate the ABM projections model. 
        So, for the time being, recovery rate will always increase when drug is supplied to a particular state.
        
    """
        
        if self.states_cond is None:
            raise Exception("You need to reset() the environment before calling step()!")
        
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
                    
        #start with equal distribution 
        if self.curr_step == 1:
            self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        
        #exploration vs exploitation        
        if random.uniform(0, 1) < self.epsilon:
            for i in range(self.states):
                action[i] = np.random.randint(0, 100/(self.states+1))
            reserved = 100-sum(action)
            action[self.states] = reserved
        else:
            for i in range(self.states):
                action[i] = np.argmax(self.valueMap[i])+1 
            reserved = 100 - sum(action) 
            action[self.states] = reserved

        #update action_list to store only the most recently used action values 
        self.action_list = action
        print("Distribution set: ",self.action_list)
        

        #no of units distrbuted to respective states 
        # received = []        
        for i in range(self.states):
            self.received[i] = self.total*self.action_list[i]/100
        reserved_qty = self.total*self.action_list[self.states]/100
        print("reserved quantity: ", reserved_qty)
        
        
        #simulation
        recovered = [0]*self.states
        
        for i in range(self.states):
            recovered[i] = 0.5*self.get_discrete_int(self.received[i])  #50% efficacy
            self.rr[i] = recovered[i]/self.states_cond[i][0] 
        print("recovery rate due to drug: ",self.rr)
        self.states_cond = np.array(self.states_cond)
        print("recovery rate(before): ", self.states_cond[:,2])       #recovery rate for the 'i'th state                         
        self.states_cond[:, 2] = self.rr                            #update values in states_cond matrix 
        self.states_cond[:, 1] -= recovered 

          
        #reward only when task done 
        reward = self.get_reward()

        #update the value map
        copyValueMap = np.copy(self.valueMap)
        deltaState = [0]*self.states
        for state in range(self.states):
            value = np.zeros((self.states, ))
            value += reward+(self.gamma*self.valueMap[state, self.get_discrete_int(self.action_list[state])])
            deltaState = np.append(deltaState, np.abs(copyValueMap[state, self.get_discrete_int(self.action_list[state])]-value[state]))
            copyValueMap[state, self.get_discrete_int(self.action_list[state])]= value[state]
        valueMap = copyValueMap
        

        # increment episode
        self.curr_step += 1


        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode': self.curr_step,
                                                    'change': deltaState}
    
    def get_reward(self):
        for i in range(self.states):
            cpu = 1
            reward = [0]*self.states
            reward[i] = self.rr[i]*math.exp(-cpu*self.received[i])
        reward = sum(reward)
        return reward 


   
    def close(self):
        pass 

locations = 5
episodes = 50
total_drugs_qty = 10000
#create an instance of the class 
env = StatesEnv(locations, episodes, total_drugs_qty)

action = [16.66, 16.66, 16.66, 16.66, 16.66, 16.66]
delta = []


obs = env.reset()
for ep in range(episodes):
    obs, reward, done, info = env.step(action)
    delta = np.append(delta, info)
    if ep%10 == 0:
        print("Episode {}".format(ep+1))        
        print("obs=", obs, "reward=", reward, "done=", done)        
    
#print(delta)
#     plt.figure(figsize=(20, 10))
#     plt.rcParams.update({'figure.max_open_warning': 0})
#     if done: 
#             print("Done:)")
#             break
# for l in range(locations):    
#     plt.subplot(locations, 1, l+1)
#     plt.plot([5,10,15,20,25,30,35,40,45,50], delta[l], 'b-')
# plt.show()


env.close()




Distribution set:  [1, 1, 1, 1, 1, 78.34]
reserved quantity:  7834.0
recovery rate due to drug:  [0.0006235346934703446, 0.0016281871763977986, 0.002950897072710104, 0.0038565368299267257, 0.0003142025852588715]
recovery rate(before):  [0. 0. 0. 0. 0.]
Episode 1
obs= [[8.01880000e+04 2.82790000e+04 6.23534693e-04 1.12970000e+04]
 [3.07090000e+04 6.46100000e+03 1.62818718e-03 3.08000000e+02]
 [1.69440000e+04 3.13600000e+03 2.95089707e-03 2.01000000e+02]
 [1.29650000e+04 2.39400000e+03 3.85653683e-03 2.36000000e+02]
 [1.59133000e+05 6.75650000e+04 3.14202585e-04 3.65000000e+02]] reward= 1.1688574890251663e-47 done= False
Distribution set:  [1, 1, 1, 1, 1, 16.659999999999997]
reserved quantity:  1665.9999999999998
recovery rate due to drug:  [0.0006235346934703446, 0.0016281871763977986, 0.002950897072710104, 0.0038565368299267257, 0.0003142025852588715]
recovery rate(before):  [0.00062353 0.00162819 0.0029509  0.00385654 0.0003142 ]
Distribution set:  [1, 1, 1, 1, 1, 78.34]
reserved quan

## Run 2

In [1]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.
    Describes relevant properties of the state and action spaces. 
    """
    metadata = {'render.modes':['human']}
    
    
    def __init__(self, s, episodes, total):
        """ 
        Observation:
        Type: Box(5)
                                                                Min         Max
        0	Confirmed Cases                                      0           Inf
        1	Active cases                                         0           Inf
        2	Recovery Rate = 100 - Death Rate=Recovered/Confirmed 0           Inf
        3	Population Density                                   0           Inf
        4	Projected Cases (to be introduced later by ABM)      0           Inf
        
    Actions:
    Type: Box (s+1)
    List of length (s+1)
    
    """
        self.states = s #no of independent simulations to be run 
        low = np.zeros(4)
        high = np.ones(4) * np.inf
        self.observation_space = spaces.Box(low, high, dtype = np.float32)
        #actions are vectors of the form [n1, n2, n3,...nk, r] for k states and r reserved amount of drug 
        self.action_space = spaces.Box(low = np.zeros((s+1, ), dtype = int), high = np.array([100]*(s+1)), shape = (s + 1, ), dtype = np.float)
        
        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total #total number of vials available in 1 batch = batch size 
        self.episodes = episodes
        self.received = [0]*self.states
        self.rr = [0]*self.states
        self.states_cond = []
        self.action_list = []
        self.gamma = 0.80
        self.epsilon = 0.4
       
       
        
    def get_discrete_int(self, n):
        discrete_int = int(n)
        return discrete_int

    def reset(self):
        """
        Resets observation_space to a matrix initialising situation of states wrt the current figures; 
        action_space tp start exploring from the point of equal distribution between all states.
        """
        self.curr_step = 0
        self.done = False
        self.total = 10000
        # Declare the Initial Conditions for the States
       
        self.states_cond =  np.array([(94695,26148, 0.0006235346934703446, 11297 ),
                              (34600,7763,0.0016281871763977986, 308),
                              (19052,3331,0.002950897072710104, 201),
                              (14297,2655,0.0038565368299267257, 236),
                              (192990,79927,0.0003142025852588715, 365)])
                               # Confirmed   Active  Recovery Rate(due to effect of drug) Population Density
                               # Delhi, Guj, Raja, MP, Maha, TN 
        #store the actions in an array 
        self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        #Declare the Value table 
        self.valueMap = np.zeros((self.states, 100))
        return self.states_cond, self.action_list
        

    def step(self, action):
        """
        Assumptions:
        1. Drug has 50% efficacy 
        2. Vaccine is passive, not antigen based- works to fight off existing infection.
        3. 1 person requires 1 vial (dose) only.
        4. No of confirmed and active cases in one particular region is constant, until we integrate the ABM projections model. 
        So, for the time being, recovery rate will always increase when drug is supplied to a particular state.
        
    """
        
        if self.states_cond is None:
            raise Exception("You need to reset() the environment before calling step()!")
        
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
                    
        #start with equal distribution 
        if self.curr_step == 1:
            self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        
        #exploration vs exploitation        
        if random.uniform(0, 1) < self.epsilon:
            for i in range(self.states):
                action[i] = np.random.randint(0, 100/(self.states+1))
            reserved = 100-sum(action)
            action[self.states] = reserved
        else:
            for i in range(self.states):
                action[i] = np.argmax(self.valueMap[i])+1 
            reserved = 100 - sum(action) 
            action[self.states] = reserved

        #update action_list to store only the most recently used action values 
        self.action_list = action
        print("Distribution set: ",self.action_list)
        

        #no of units distrbuted to respective states 
        # received = []        
        for i in range(self.states):
            self.received[i] = self.total*self.action_list[i]/100
        reserved_qty = self.total*self.action_list[self.states]/100
        print("reserved quantity: ", reserved_qty)
        
        
        #simulation
        recovered = [0]*self.states
        
        for i in range(self.states):
            recovered[i] = 0.5*self.get_discrete_int(self.received[i])  #50% efficacy
            self.rr[i] = recovered[i]/self.states_cond[i][0] 
        print("recovery rate due to drug: ",self.rr)
        self.states_cond = np.array(self.states_cond)
        print("recovery rate(before): ", self.states_cond[:,2])       #recovery rate for the 'i'th state                         
        self.states_cond[:, 2] = self.rr                            #update values in states_cond matrix 
        self.states_cond[:, 1] -= recovered 

          
        #reward only when task done 
        reward = self.get_reward()

        #update the value map
        copyValueMap = np.copy(self.valueMap)
        deltaState = [0]*self.states
        for state in range(self.states):
            value = np.zeros((self.states, ))
            value += reward+(self.gamma*self.valueMap[state, self.get_discrete_int(self.action_list[state])])
            deltaState = np.append(deltaState, np.abs(copyValueMap[state, self.get_discrete_int(self.action_list[state])]-value[state]))
            copyValueMap[state, self.get_discrete_int(self.action_list[state])]= value[state]
        valueMap = copyValueMap
        

        # increment episode
        self.curr_step += 1


        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode': self.curr_step,
                                                    'change': deltaState}
    
    def get_reward(self):
        for i in range(self.states):
            cpu = 1
            reward = [0]*self.states
            reward[i] = self.rr[i]*math.exp(-cpu*self.received[i])
        reward = sum(reward)
        return reward 


   
    def close(self):
        pass 

locations = 5
episodes = 50
total_drugs_qty = 10000
#create an instance of the class 
env = StatesEnv(locations, episodes, total_drugs_qty)

action = [16.66, 16.66, 16.66, 16.66, 16.66, 16.66]
delta = []


obs = env.reset()
for ep in range(episodes):
    obs, reward, done, info = env.step(action)
    delta = np.append(delta, info)
    if ep%10 == 0:
        print("Episode {}".format(ep+1))        
        print("obs=", obs, "reward=", reward, "done=", done)        
    
#print(delta)
#     plt.figure(figsize=(20, 10))
#     plt.rcParams.update({'figure.max_open_warning': 0})
#     if done: 
#             print("Done:)")
#             break
# for l in range(locations):    
#     plt.subplot(locations, 1, l+1)
#     plt.plot([5,10,15,20,25,30,35,40,45,50], delta[l], 'b-')
# plt.show()


env.close()




Distribution set:  [1, 1, 1, 1, 1, 78.34]
reserved quantity:  7834.0
recovery rate due to drug:  [0.0005280109826284387, 0.001445086705202312, 0.002624396388830569, 0.0034972371826257256, 0.0002590807813876367]
recovery rate(before):  [0.00062353 0.00162819 0.0029509  0.00385654 0.0003142 ]
Episode 1
obs= [[9.46950000e+04 2.60980000e+04 5.28010983e-04 1.12970000e+04]
 [3.46000000e+04 7.71300000e+03 1.44508671e-03 3.08000000e+02]
 [1.90520000e+04 3.28100000e+03 2.62439639e-03 2.01000000e+02]
 [1.42970000e+04 2.60500000e+03 3.49723718e-03 2.36000000e+02]
 [1.92990000e+05 7.98770000e+04 2.59080781e-04 3.65000000e+02]] reward= 9.638001906888535e-48 done= False
Distribution set:  [1, 1, 1, 1, 1, 16.659999999999997]
reserved quantity:  1665.9999999999998
recovery rate due to drug:  [0.0005280109826284387, 0.001445086705202312, 0.002624396388830569, 0.0034972371826257256, 0.0002590807813876367]
recovery rate(before):  [0.00052801 0.00144509 0.0026244  0.00349724 0.00025908]
Distribution set: 



## Run 3

In [3]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.
    Describes relevant properties of the state and action spaces. 
    """
    metadata = {'render.modes':['human']}
    
    
    def __init__(self, s, episodes, total):
        """ 
        Observation:
        Type: Box(5)
                                                                Min         Max
        0	Confirmed Cases                                      0           Inf
        1	Active cases                                         0           Inf
        2	Recovery Rate = 100 - Death Rate=Recovered/Confirmed 0           Inf
        3	Population Density                                   0           Inf
        4	Projected Cases (to be introduced later by ABM)      0           Inf
        
    Actions:
    Type: Box (s+1)
    List of length (s+1)
    
    """
        self.states = s #no of independent simulations to be run 
        low = np.zeros(4)
        high = np.ones(4) * np.inf
        self.observation_space = spaces.Box(low, high, dtype = np.float32)
        #actions are vectors of the form [n1, n2, n3,...nk, r] for k states and r reserved amount of drug 
        self.action_space = spaces.Box(low = np.zeros((s+1, ), dtype = int), high = np.array([100]*(s+1)), shape = (s + 1, ), dtype = np.float)
        
        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total #total number of vials available in 1 batch = batch size 
        self.episodes = episodes
        self.received = [0]*self.states
        self.rr = [0]*self.states
        self.states_cond = []
        self.action_list = []
        self.gamma = 0.80
        self.epsilon = 0.4
       
       
        
    def get_discrete_int(self, n):
        discrete_int = int(n)
        return discrete_int

    def reset(self):
        """
        Resets observation_space to a matrix initialising situation of states wrt the current figures; 
        action_space tp start exploring from the point of equal distribution between all states.
        """
        self.curr_step = 0
        self.done = False
        self.total = 10000
        # Declare the Initial Conditions for the States
       
        self.states_cond =  np.array([(109140,21146, 0.003696076878399071, 11297 ),
                              (40069,9900,0.002890173410404624, 308),
                              (23174,5057,0.007873189166491707, 201),
                              (16657,3538,0.03497237182625726, 236),
                              (238461,95943,0.0005181615627752734, 365)])
                               # Confirmed   Active  Recovery Rate(due to effect of drug) Population Density
                               # Delhi, Guj, Raja, MP, Maha
        #store the actions in an array 
        self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        #Declare the Value table 
        self.valueMap = np.zeros((self.states, 100))
        return self.states_cond, self.action_list
        

    def step(self, action):
        """
        Assumptions:
        1. Drug has 50% efficacy 
        2. Vaccine is passive, not antigen based- works to fight off existing infection.
        3. 1 person requires 1 vial (dose) only.
        4. No of confirmed and active cases in one particular region is constant, until we integrate the ABM projections model. 
        So, for the time being, recovery rate will always increase when drug is supplied to a particular state.
        
    """
        
        if self.states_cond is None:
            raise Exception("You need to reset() the environment before calling step()!")
        
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
                    
        #start with equal distribution 
        if self.curr_step == 1:
            self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        
        #exploration vs exploitation        
        if random.uniform(0, 1) < self.epsilon:
            for i in range(self.states):
                action[i] = np.random.randint(0, 100/(self.states+1))
            reserved = 100-sum(action)
            action[self.states] = reserved
        else:
            for i in range(self.states):
                action[i] = np.argmax(self.valueMap[i])+1 
            reserved = 100 - sum(action) 
            action[self.states] = reserved

        #update action_list to store only the most recently used action values 
        self.action_list = action
        print("Distribution set: ",self.action_list)
        

        #no of units distrbuted to respective states 
        # received = []        
        for i in range(self.states):
            self.received[i] = self.total*self.action_list[i]/100
        reserved_qty = self.total*self.action_list[self.states]/100
        print("reserved quantity: ", reserved_qty)
        
        
        #simulation
        recovered = [0]*self.states
        
        for i in range(self.states):
            recovered[i] = 0.5*self.get_discrete_int(self.received[i])  #50% efficacy
            self.rr[i] = recovered[i]/self.states_cond[i][0] 
        print("recovery rate due to drug: ",self.rr)
        self.states_cond = np.array(self.states_cond)
        print("recovery rate(before): ", self.states_cond[:,2])       #recovery rate for the 'i'th state                         
        self.states_cond[:, 2] = self.rr                            #update values in states_cond matrix 
        self.states_cond[:, 1] -= recovered 

          
        #reward only when task done 
        reward = self.get_reward()

        #update the value map
        copyValueMap = np.copy(self.valueMap)
        deltaState = [0]*self.states
        for state in range(self.states):
            value = np.zeros((self.states, ))
            value += reward+(self.gamma*self.valueMap[state, self.get_discrete_int(self.action_list[state])])
            deltaState = np.append(deltaState, np.abs(copyValueMap[state, self.get_discrete_int(self.action_list[state])]-value[state]))
            copyValueMap[state, self.get_discrete_int(self.action_list[state])]= value[state]
        valueMap = copyValueMap
        

        # increment episode
        self.curr_step += 1


        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode': self.curr_step,
                                                    'change': deltaState}
    
    def get_reward(self):
        for i in range(self.states):
            cpu = 1
            reward = [0]*self.states
            reward[i] = self.rr[i]*math.exp(-cpu*self.received[i])
        reward = sum(reward)
        return reward 


   
    def close(self):
        pass 

locations = 5
episodes = 50
total_drugs_qty = 10000
#create an instance of the class 
env = StatesEnv(locations, episodes, total_drugs_qty)

action = [16.66, 16.66, 16.66, 16.66, 16.66, 16.66]
delta = []


obs = env.reset()
for ep in range(episodes):
    obs, reward, done, info = env.step(action)
    delta = np.append(delta, info)
    if ep%10 == 0:
        print("Episode {}".format(ep+1))        
        print("obs=", obs, "reward=", reward, "done=", done)        
    
#print(delta)
#     plt.figure(figsize=(20, 10))
#     plt.rcParams.update({'figure.max_open_warning': 0})
#     if done: 
#             print("Done:)")
#             break
# for l in range(locations):    
#     plt.subplot(locations, 1, l+1)
#     plt.plot([5,10,15,20,25,30,35,40,45,50], delta[l], 'b-')
# plt.show()


env.close()




Distribution set:  [1, 1, 1, 1, 1, 78.34]
reserved quantity:  7834.0
recovery rate due to drug:  [0.0004581271761040865, 0.0012478474631261074, 0.0021575904030378874, 0.003001741009785676, 0.0002096778928210483]
recovery rate(before):  [0.00369608 0.00289017 0.00787319 0.03497237 0.00051816]
Episode 1
obs= [[1.09140000e+05 2.10960000e+04 4.58127176e-04 1.12970000e+04]
 [4.00690000e+04 9.85000000e+03 1.24784746e-03 3.08000000e+02]
 [2.31740000e+04 5.00700000e+03 2.15759040e-03 2.01000000e+02]
 [1.66570000e+04 3.48800000e+03 3.00174101e-03 2.36000000e+02]
 [2.38461000e+05 9.58930000e+04 2.09677893e-04 3.65000000e+02]] reward= 7.800176917862536e-48 done= False
Distribution set:  [1, 1, 1, 1, 1, 16.659999999999997]
reserved quantity:  1665.9999999999998
recovery rate due to drug:  [0.0004581271761040865, 0.0012478474631261074, 0.0021575904030378874, 0.003001741009785676, 0.0002096778928210483]
recovery rate(before):  [0.00045813 0.00124785 0.00215759 0.00300174 0.00020968]
Distribution set

## Run 4

In [4]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.
    Describes relevant properties of the state and action spaces. 
    """
    metadata = {'render.modes':['human']}
    
    
    def __init__(self, s, episodes, total):
        """ 
        Observation:
        Type: Box(5)
                                                                Min         Max
        0	Confirmed Cases                                      0           Inf
        1	Active cases                                         0           Inf
        2	Recovery Rate = 100 - Death Rate=Recovered/Confirmed 0           Inf
        3	Population Density                                   0           Inf
        4	Projected Cases (to be introduced later by ABM)      0           Inf
        
    Actions:
    Type: Box (s+1)
    List of length (s+1)
    
    """
        self.states = s #no of independent simulations to be run 
        low = np.zeros(4)
        high = np.ones(4) * np.inf
        self.observation_space = spaces.Box(low, high, dtype = np.float32)
        #actions are vectors of the form [n1, n2, n3,...nk, r] for k states and r reserved amount of drug 
        self.action_space = spaces.Box(low = np.zeros((s+1, ), dtype = int), high = np.array([100]*(s+1)), shape = (s + 1, ), dtype = np.float)
        
        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total #total number of vials available in 1 batch = batch size 
        self.episodes = episodes
        self.received = [0]*self.states
        self.rr = [0]*self.states
        self.states_cond = []
        self.action_list = []
        self.gamma = 0.80
        self.epsilon = 0.4
       
       
        
    def get_discrete_int(self, n):
        discrete_int = int(n)
        return discrete_int

    def reset(self):
        """
        Resets observation_space to a matrix initialising situation of states wrt the current figures; 
        action_space tp start exploring from the point of equal distribution between all states.
        """
        self.curr_step = 0
        self.done = False
        self.total = 10000
        # Declare the Initial Conditions for the States
       
        self.states_cond =  np.array([(115346,18664, 0.004581271761040865, 11297 ),
                              (43637,11065,0.002495694926252215, 308),
                              (25571,5885,0.010787952015189436, 201),
                              (19005,4757,0.03602089211742811, 236),
                              (267665,107963,0.0, 365)])
                               # Confirmed   Active  Recovery Rate(due to effect of drug) Population Density
                               # Delhi, Guj, Raja, MP, Maha
        #store the actions in an array 
        self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        #Declare the Value table 
        self.valueMap = np.zeros((self.states, 100))
        return self.states_cond, self.action_list
        

    def step(self, action):
        """
        Assumptions:
        1. Drug has 50% efficacy 
        2. Vaccine is passive, not antigen based- works to fight off existing infection.
        3. 1 person requires 1 vial (dose) only.
        4. No of confirmed and active cases in one particular region is constant, until we integrate the ABM projections model. 
        So, for the time being, recovery rate will always increase when drug is supplied to a particular state.
        
    """
        
        if self.states_cond is None:
            raise Exception("You need to reset() the environment before calling step()!")
        
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
                    
        #start with equal distribution 
        if self.curr_step == 1:
            self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        
        #exploration vs exploitation        
        if random.uniform(0, 1) < self.epsilon:
            for i in range(self.states):
                action[i] = np.random.randint(0, 100/(self.states+1))
            reserved = 100-sum(action)
            action[self.states] = reserved
        else:
            for i in range(self.states):
                action[i] = np.argmax(self.valueMap[i])+1 
            reserved = 100 - sum(action) 
            action[self.states] = reserved

        #update action_list to store only the most recently used action values 
        self.action_list = action
        print("Distribution set: ",self.action_list)
        

        #no of units distrbuted to respective states 
        # received = []        
        for i in range(self.states):
            self.received[i] = self.total*self.action_list[i]/100
        reserved_qty = self.total*self.action_list[self.states]/100
        print("reserved quantity: ", reserved_qty)
        
        
        #simulation
        recovered = [0]*self.states
        
        for i in range(self.states):
            recovered[i] = 0.5*self.get_discrete_int(self.received[i])  #50% efficacy
            self.rr[i] = recovered[i]/self.states_cond[i][0] 
        print("recovery rate due to drug: ",self.rr)
        self.states_cond = np.array(self.states_cond)
        print("recovery rate(before): ", self.states_cond[:,2])       #recovery rate for the 'i'th state                         
        self.states_cond[:, 2] = self.rr                            #update values in states_cond matrix 
        self.states_cond[:, 1] -= recovered 

          
        #reward only when task done 
        reward = self.get_reward()

        #update the value map
        copyValueMap = np.copy(self.valueMap)
        deltaState = [0]*self.states
        for state in range(self.states):
            value = np.zeros((self.states, ))
            value += reward+(self.gamma*self.valueMap[state, self.get_discrete_int(self.action_list[state])])
            deltaState = np.append(deltaState, np.abs(copyValueMap[state, self.get_discrete_int(self.action_list[state])]-value[state]))
            copyValueMap[state, self.get_discrete_int(self.action_list[state])]= value[state]
        valueMap = copyValueMap
        

        # increment episode
        self.curr_step += 1


        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode': self.curr_step,
                                                    'change': deltaState}
    
    def get_reward(self):
        for i in range(self.states):
            cpu = 1
            reward = [0]*self.states
            reward[i] = self.rr[i]*math.exp(-cpu*self.received[i])
        reward = sum(reward)
        return reward 


   
    def close(self):
        pass 

locations = 5
episodes = 50
total_drugs_qty = 10000
#create an instance of the class 
env = StatesEnv(locations, episodes, total_drugs_qty)

action = [16.66, 16.66, 16.66, 16.66, 16.66, 16.66]
delta = []


obs = env.reset()
for ep in range(episodes):
    obs, reward, done, info = env.step(action)
    delta = np.append(delta, info)
    if ep%10 == 0:
        print("Episode {}".format(ep+1))        
        print("obs=", obs, "reward=", reward, "done=", done)        
    
#print(delta)
#     plt.figure(figsize=(20, 10))
#     plt.rcParams.update({'figure.max_open_warning': 0})
#     if done: 
#             print("Done:)")
#             break
# for l in range(locations):    
#     plt.subplot(locations, 1, l+1)
#     plt.plot([5,10,15,20,25,30,35,40,45,50], delta[l], 'b-')
# plt.show()


env.close()




Distribution set:  [1, 13, 1, 8, 3, 57.34]
reserved quantity:  5734.0
recovery rate due to drug:  [0.00043347840410590743, 0.01489561610559846, 0.0019553400336318485, 0.02104709287029729, 0.0005604019950311023]
recovery rate(before):  [0.00458127 0.00249569 0.01078795 0.03602089 0.        ]
Episode 1
obs= [[1.15346000e+05 1.86140000e+04 4.33478404e-04 1.12970000e+04]
 [4.36370000e+04 1.04150000e+04 1.48956161e-02 3.08000000e+02]
 [2.55710000e+04 5.83500000e+03 1.95534003e-03 2.01000000e+02]
 [1.90050000e+04 4.35700000e+03 2.10470929e-02 2.36000000e+02]
 [2.67665000e+05 1.07813000e+05 5.60401995e-04 3.65000000e+02]] reward= 2.885061675459257e-134 done= False
Distribution set:  [1, 1, 1, 1, 1, 37.66]
reserved quantity:  3765.9999999999995
recovery rate due to drug:  [0.00043347840410590743, 0.001145816623507574, 0.0019553400336318485, 0.002630886608787161, 0.00018680066501036743]
recovery rate(before):  [0.00043348 0.01489562 0.00195534 0.02104709 0.0005604 ]
Distribution set:  [1, 1, 1,

## Run 5

In [6]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.
    Describes relevant properties of the state and action spaces. 
    """
    metadata = {'render.modes':['human']}
    
    
    def __init__(self, s, episodes, total):
        """ 
        Observation:
        Type: Box(5)
                                                                Min         Max
        0	Confirmed Cases                                      0           Inf
        1	Active cases                                         0           Inf
        2	Recovery Rate = 100 - Death Rate=Recovered/Confirmed 0           Inf
        3	Population Density                                   0           Inf
        4	Projected Cases (to be introduced later by ABM)      0           Inf
        
    Actions:
    Type: Box (s+1)
    List of length (s+1)
    
    """
        self.states = s #no of independent simulations to be run 
        low = np.zeros(4)
        high = np.ones(4) * np.inf
        self.observation_space = spaces.Box(low, high, dtype = np.float32)
        #actions are vectors of the form [n1, n2, n3,...nk, r] for k states and r reserved amount of drug 
        self.action_space = spaces.Box(low = np.zeros((s+1, ), dtype = int), high = np.array([100]*(s+1)), shape = (s + 1, ), dtype = np.float)
        
        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total #total number of vials available in 1 batch = batch size 
        self.episodes = episodes
        self.received = [0]*self.states
        self.rr = [0]*self.states
        self.states_cond = []
        self.action_list = []
        self.gamma = 0.80
        self.epsilon = 0.4
       
       
        
    def get_discrete_int(self, n):
        discrete_int = int(n)
        return discrete_int

    def reset(self):
        """
        Resets observation_space to a matrix initialising situation of states wrt the current figures; 
        action_space tp start exploring from the point of equal distribution between all states.
        """
        self.curr_step = 0
        self.done = False
        self.total = 10000
        # Declare the Initial Conditions for the States
       
        self.states_cond =  np.array([(125096,15288, 0.0008669568082118149, 11297 ),
                              (50379,11760,0.016041432729106034, 308),
                              (31373,8052,0.01368738023542294, 201),
                              (24095,7082,0.01841620626151013, 236),
                              (327031,132538,0.0007472026600414697, 365)])
                               # Confirmed   Active  Recovery Rate(due to effect of drug) Population Density
                               # Delhi, Guj, Raja, MP, Maha
        #store the actions in an array 
        self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        #Declare the Value table 
        self.valueMap = np.zeros((self.states, 100))
        return self.states_cond, self.action_list
        

    def step(self, action):
        """
        Assumptions:
        1. Drug has 50% efficacy 
        2. Vaccine is passive, not antigen based- works to fight off existing infection.
        3. 1 person requires 1 vial (dose) only.
        4. No of confirmed and active cases in one particular region is constant, until we integrate the ABM projections model. 
        So, for the time being, recovery rate will always increase when drug is supplied to a particular state.
        
    """
        
        if self.states_cond is None:
            raise Exception("You need to reset() the environment before calling step()!")
        
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
                    
        #start with equal distribution 
        if self.curr_step == 1:
            self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        
        #exploration vs exploitation        
        if random.uniform(0, 1) < self.epsilon:
            for i in range(self.states):
                action[i] = np.random.randint(0, 100/(self.states+1))
            reserved = 100-sum(action)
            action[self.states] = reserved
        else:
            for i in range(self.states):
                action[i] = np.argmax(self.valueMap[i])+1 
            reserved = 100 - sum(action) 
            action[self.states] = reserved

        #update action_list to store only the most recently used action values 
        self.action_list = action
        print("Distribution set: ",self.action_list)
        

        #no of units distrbuted to respective states 
        # received = []        
        for i in range(self.states):
            self.received[i] = self.total*self.action_list[i]/100
        reserved_qty = self.total*self.action_list[self.states]/100
        print("reserved quantity: ", reserved_qty)
        
        
        #simulation
        recovered = [0]*self.states
        
        for i in range(self.states):
            recovered[i] = 0.5*self.get_discrete_int(self.received[i])  #50% efficacy
            self.rr[i] = recovered[i]/self.states_cond[i][0] 
        print("recovery rate due to drug: ",self.rr)
        self.states_cond = np.array(self.states_cond)
        print("recovery rate(before): ", self.states_cond[:,2])       #recovery rate for the 'i'th state                         
        self.states_cond[:, 2] = self.rr                            #update values in states_cond matrix 
        self.states_cond[:, 1] -= recovered 

          
        #reward only when task done 
        reward = self.get_reward()

        #update the value map
        copyValueMap = np.copy(self.valueMap)
        deltaState = [0]*self.states
        for state in range(self.states):
            value = np.zeros((self.states, ))
            value += reward+(self.gamma*self.valueMap[state, self.get_discrete_int(self.action_list[state])])
            deltaState = np.append(deltaState, np.abs(copyValueMap[state, self.get_discrete_int(self.action_list[state])]-value[state]))
            copyValueMap[state, self.get_discrete_int(self.action_list[state])]= value[state]
        valueMap = copyValueMap
        

        # increment episode
        self.curr_step += 1


        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode': self.curr_step,
                                                    'change': deltaState}
    
    def get_reward(self):
        for i in range(self.states):
            cpu = 1
            reward = [0]*self.states
            reward[i] = self.rr[i]*math.exp(-cpu*self.received[i])
        reward = sum(reward)
        return reward 


   
    def close(self):
        pass 

locations = 5
episodes = 50
total_drugs_qty = 10000
#create an instance of the class 
env = StatesEnv(locations, episodes, total_drugs_qty)

action = [16.66, 16.66, 16.66, 16.66, 16.66, 16.66]
delta = []


obs = env.reset()
for ep in range(episodes):
    obs, reward, done, info = env.step(action)
    delta = np.append(delta, info)
    print("Episode {}".format(ep+1))        
    print("obs=", obs, "reward=", reward, "done=", done)        
    
#print(delta)
#     plt.figure(figsize=(20, 10))
#     plt.rcParams.update({'figure.max_open_warning': 0})
#     if done: 
#             print("Done:)")
#             break
# for l in range(locations):    
#     plt.subplot(locations, 1, l+1)
#     plt.plot([5,10,15,20,25,30,35,40,45,50], delta[l], 'b-')
# plt.show()


env.close()




Distribution set:  [4, 8, 2, 11, 7, 51.34]
reserved quantity:  5134.000000000001
recovery rate due to drug:  [0.0015987721429941806, 0.007939816193255126, 0.0031874541803461574, 0.022826312512969495, 0.001070234931856613]
recovery rate(before):  [0.00086696 0.01604143 0.01368738 0.01841621 0.0007472 ]
Episode 1
obs= [[1.25096000e+05 1.50880000e+04 1.59877214e-03 1.12970000e+04]
 [5.03790000e+04 1.13600000e+04 7.93981619e-03 3.08000000e+02]
 [3.13730000e+04 7.95200000e+03 3.18745418e-03 2.01000000e+02]
 [2.40950000e+04 6.53200000e+03 2.28263125e-02 2.36000000e+02]
 [3.27031000e+05 1.32188000e+05 1.07023493e-03 3.65000000e+02]] reward= 1.0552170253938984e-307 done= False
Distribution set:  [3, 3, 5, 1, 3, 33.66]
reserved quantity:  3365.9999999999995
recovery rate due to drug:  [0.0011990791072456354, 0.0029774310724706723, 0.007968635450865394, 0.002075119319360863, 0.00045867211365283414]
recovery rate(before):  [0.00159877 0.00793982 0.00318745 0.02282631 0.00107023]
Episode 2
obs= [[

## Run 6

In [7]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.
    Describes relevant properties of the state and action spaces. 
    """
    metadata = {'render.modes':['human']}
    
    
    def __init__(self, s, episodes, total):
        """ 
        Observation:
        Type: Box(5)
                                                                Min         Max
        0	Confirmed Cases                                      0           Inf
        1	Active cases                                         0           Inf
        2	Recovery Rate = 100 - Death Rate=Recovered/Confirmed 0           Inf
        3	Population Density                                   0           Inf
        4	Projected Cases (to be introduced later by ABM)      0           Inf
        
    Actions:
    Type: Box (s+1)
    List of length (s+1)
    
    """
        self.states = s #no of independent simulations to be run 
        low = np.zeros(4)
        high = np.ones(4) * np.inf
        self.observation_space = spaces.Box(low, high, dtype = np.float32)
        #actions are vectors of the form [n1, n2, n3,...nk, r] for k states and r reserved amount of drug 
        self.action_space = spaces.Box(low = np.zeros((s+1, ), dtype = int), high = np.array([100]*(s+1)), shape = (s + 1, ), dtype = np.float)
        
        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total #total number of vials available in 1 batch = batch size 
        self.episodes = episodes
        self.received = [0]*self.states
        self.rr = [0]*self.states
        self.states_cond = []
        self.action_list = []
        self.gamma = 0.80
        self.epsilon = 0.4
       
       
        
    def get_discrete_int(self, n):
        discrete_int = int(n)
        return discrete_int

    def reset(self):
        """
        Resets observation_space to a matrix initialising situation of states wrt the current figures; 
        action_space tp start exploring from the point of equal distribution between all states.
        """
        self.curr_step = 0
        self.done = False
        self.total = 10000
        # Declare the Initial Conditions for the States
       
        self.states_cond =  np.array([(130606,11904, 0.005995395536228177, 11297 ),
                              (55822,13131,0.007939816193255126, 308),
                              (35909,9935,0.022312179262423102, 201),
                              (27800,7857,0.03112678979041295, 236),
                              (375799,148905,0.0012231256364075578, 365)])
                               # Confirmed   Active  Recovery Rate(due to effect of drug) Population Density
                               # Delhi, Guj, Raja, MP, Maha
        #store the actions in an array 
        self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        #Declare the Value table 
        self.valueMap = np.zeros((self.states, 100))
        return self.states_cond, self.action_list
        

    def step(self, action):
        """
        Assumptions:
        1. Drug has 50% efficacy 
        2. Vaccine is passive, not antigen based- works to fight off existing infection.
        3. 1 person requires 1 vial (dose) only.
        4. No of confirmed and active cases in one particular region is constant, until we integrate the ABM projections model. 
        So, for the time being, recovery rate will always increase when drug is supplied to a particular state.
        
    """
        
        if self.states_cond is None:
            raise Exception("You need to reset() the environment before calling step()!")
        
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
                    
        #start with equal distribution 
        if self.curr_step == 1:
            self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        
        #exploration vs exploitation        
        if random.uniform(0, 1) < self.epsilon:
            for i in range(self.states):
                action[i] = np.random.randint(0, 100/(self.states+1))
            reserved = 100-sum(action)
            action[self.states] = reserved
        else:
            for i in range(self.states):
                action[i] = np.argmax(self.valueMap[i])+1 
            reserved = 100 - sum(action) 
            action[self.states] = reserved

        #update action_list to store only the most recently used action values 
        self.action_list = action
        print("Distribution set: ",self.action_list)
        

        #no of units distrbuted to respective states 
        # received = []        
        for i in range(self.states):
            self.received[i] = self.total*self.action_list[i]/100
        reserved_qty = self.total*self.action_list[self.states]/100
        print("reserved quantity: ", reserved_qty)
        
        
        #simulation
        recovered = [0]*self.states
        
        for i in range(self.states):
            recovered[i] = 0.5*self.get_discrete_int(self.received[i])  #50% efficacy
            self.rr[i] = recovered[i]/self.states_cond[i][0] 
        print("recovery rate due to drug: ",self.rr)
        self.states_cond = np.array(self.states_cond)
        print("recovery rate(before): ", self.states_cond[:,2])       #recovery rate for the 'i'th state                         
        self.states_cond[:, 2] = self.rr                            #update values in states_cond matrix 
        self.states_cond[:, 1] -= recovered 

          
        #reward only when task done 
        reward = self.get_reward()

        #update the value map
        copyValueMap = np.copy(self.valueMap)
        deltaState = [0]*self.states
        for state in range(self.states):
            value = np.zeros((self.states, ))
            value += reward+(self.gamma*self.valueMap[state, self.get_discrete_int(self.action_list[state])])
            deltaState = np.append(deltaState, np.abs(copyValueMap[state, self.get_discrete_int(self.action_list[state])]-value[state]))
            copyValueMap[state, self.get_discrete_int(self.action_list[state])]= value[state]
        valueMap = copyValueMap
        

        # increment episode
        self.curr_step += 1


        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode': self.curr_step,
                                                    'change': deltaState}
    
    def get_reward(self):
        for i in range(self.states):
            cpu = 1
            reward = [0]*self.states
            reward[i] = self.rr[i]*math.exp(-cpu*self.received[i])
        reward = sum(reward)
        return reward 


   
    def close(self):
        pass 

locations = 5
episodes = 50
total_drugs_qty = 10000
#create an instance of the class 
env = StatesEnv(locations, episodes, total_drugs_qty)

action = [16.66, 16.66, 16.66, 16.66, 16.66, 16.66]
delta = []


obs = env.reset()
for ep in range(episodes):
    obs, reward, done, info = env.step(action)
    delta = np.append(delta, info)
    print("Episode {}".format(ep+1))        
    print("obs=", obs, "reward=", reward, "done=", done)        
    
#print(delta)
#     plt.figure(figsize=(20, 10))
#     plt.rcParams.update({'figure.max_open_warning': 0})
#     if done: 
#             print("Done:)")
#             break
# for l in range(locations):    
#     plt.subplot(locations, 1, l+1)
#     plt.plot([5,10,15,20,25,30,35,40,45,50], delta[l], 'b-')
# plt.show()


env.close()




Distribution set:  [14, 9, 6, 6, 15, 33.34]
reserved quantity:  3334.0000000000005
recovery rate due to drug:  [0.005359631257369493, 0.008061337823797069, 0.008354451530257039, 0.01079136690647482, 0.001995747726843339]
recovery rate(before):  [0.0059954  0.00793982 0.02231218 0.03112679 0.00122313]
Episode 1
obs= [[1.30606000e+05 1.12040000e+04 5.35963126e-03 1.12970000e+04]
 [5.58220000e+04 1.26810000e+04 8.06133782e-03 3.08000000e+02]
 [3.59090000e+04 9.63500000e+03 8.35445153e-03 2.01000000e+02]
 [2.78000000e+04 7.55700000e+03 1.07913669e-02 2.36000000e+02]
 [3.75799000e+05 1.48155000e+05 1.99574773e-03 3.65000000e+02]] reward= 0.0 done= False
Distribution set:  [1, 1, 1, 1, 1, 61.66]
reserved quantity:  6166.0
recovery rate due to drug:  [0.00038283080409782095, 0.0008957042026441188, 0.001392408588376173, 0.0017985611510791368, 0.00013304984845622262]
recovery rate(before):  [0.00535963 0.00806134 0.00835445 0.01079137 0.00199575]
Episode 2
obs= [[1.30606000e+05 1.11540000e+04 3

## Run 7

In [8]:
import numpy as np 
import gym 
from gym import spaces 
import matplotlib.pyplot as plt 
import math
import random
# Stable Baselines only supports tensorflow 1.x for now
#jupyter notebook 
# import sys 
# !{sys.executable} -m pip install tensorflow==1.15.0
#google colab 
# %tensorflow_version 1.x

# !pip install stable-baselines[mpi]==2.10.0

class StatesEnv(gym.Env):
    """
    Customised Environment that follows gym interface.
    Describes relevant properties of the state and action spaces. 
    """
    metadata = {'render.modes':['human']}
    
    
    def __init__(self, s, episodes, total):
        """ 
        Observation:
        Type: Box(5)
                                                                Min         Max
        0	Confirmed Cases                                      0           Inf
        1	Active cases                                         0           Inf
        2	Recovery Rate = 100 - Death Rate=Recovered/Confirmed 0           Inf
        3	Population Density                                   0           Inf
        4	Projected Cases (to be introduced later by ABM)      0           Inf
        
    Actions:
    Type: Box (s+1)
    List of length (s+1)
    
    """
        self.states = s #no of independent simulations to be run 
        low = np.zeros(4)
        high = np.ones(4) * np.inf
        self.observation_space = spaces.Box(low, high, dtype = np.float32)
        #actions are vectors of the form [n1, n2, n3,...nk, r] for k states and r reserved amount of drug 
        self.action_space = spaces.Box(low = np.zeros((s+1, ), dtype = int), high = np.array([100]*(s+1)), shape = (s + 1, ), dtype = np.float)
        
        self.curr_step = 0
        self.done = False
        self.valueMap = np.zeros((self.states, 100))
        self.total = total #total number of vials available in 1 batch = batch size 
        self.episodes = episodes
        self.received = [0]*self.states
        self.rr = [0]*self.states
        self.states_cond = []
        self.action_list = []
        self.gamma = 0.80
        self.epsilon = 0.4
       
       
        
    def get_discrete_int(self, n):
        discrete_int = int(n)
        return discrete_int

    def reset(self):
        """
        Resets observation_space to a matrix initialising situation of states wrt the current figures; 
        action_space tp start exploring from the point of equal distribution between all states.
        """
        self.curr_step = 0
        self.done = False
        self.total = 10000
        # Declare the Initial Conditions for the States
       
        self.states_cond =  np.array([(135598,10705, 0.0042111388450760305, 11297 ),
                              (61438,14090,0.009852746229085307, 308),
                              (41298,11589,0.01810131164889025, 201),
                              (31806,8668,0.02697841726618705, 236),
                              (422118,150966,0.0005321993938248905, 365)])
                               # Confirmed   Active  Recovery Rate(due to effect of drug) Population Density
                               # Delhi, Guj, Raja, MP, Maha
        #store the actions in an array 
        self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        #Declare the Value table 
        self.valueMap = np.zeros((self.states, 100))
        return self.states_cond, self.action_list
        

    def step(self, action):
        """
        Assumptions:
        1. Drug has 50% efficacy 
        2. Vaccine is passive, not antigen based- works to fight off existing infection.
        3. 1 person requires 1 vial (dose) only.
        4. No of confirmed and active cases in one particular region is constant, until we integrate the ABM projections model. 
        So, for the time being, recovery rate will always increase when drug is supplied to a particular state.
        
    """
        
        if self.states_cond is None:
            raise Exception("You need to reset() the environment before calling step()!")
        
        # check if we're done
        if self.curr_step >= self.episodes - 1:
            self.done = True
                    
        #start with equal distribution 
        if self.curr_step == 1:
            self.action_list = np.array([100/(self.states+1)]*(self.states+1))
        
        #exploration vs exploitation        
        if random.uniform(0, 1) < self.epsilon:
            for i in range(self.states):
                action[i] = np.random.randint(0, 100/(self.states+1))
            reserved = 100-sum(action)
            action[self.states] = reserved
        else:
            for i in range(self.states):
                action[i] = np.argmax(self.valueMap[i])+1 
            reserved = 100 - sum(action) 
            action[self.states] = reserved

        #update action_list to store only the most recently used action values 
        self.action_list = action
        print("Distribution set: ",self.action_list)
        

        #no of units distrbuted to respective states 
        # received = []        
        for i in range(self.states):
            self.received[i] = self.total*self.action_list[i]/100
        reserved_qty = self.total*self.action_list[self.states]/100
        print("reserved quantity: ", reserved_qty)
        
        
        #simulation
        recovered = [0]*self.states
        
        for i in range(self.states):
            recovered[i] = 0.5*self.get_discrete_int(self.received[i])  #50% efficacy
            self.rr[i] = recovered[i]/self.states_cond[i][0] 
        print("recovery rate due to drug: ",self.rr)
        self.states_cond = np.array(self.states_cond)
        print("recovery rate(before): ", self.states_cond[:,2])       #recovery rate for the 'i'th state                         
        self.states_cond[:, 2] = self.rr                            #update values in states_cond matrix 
        self.states_cond[:, 1] -= recovered 

          
        #reward only when task done 
        reward = self.get_reward()

        #update the value map
        copyValueMap = np.copy(self.valueMap)
        deltaState = [0]*self.states
        for state in range(self.states):
            value = np.zeros((self.states, ))
            value += reward+(self.gamma*self.valueMap[state, self.get_discrete_int(self.action_list[state])])
            deltaState = np.append(deltaState, np.abs(copyValueMap[state, self.get_discrete_int(self.action_list[state])]-value[state]))
            copyValueMap[state, self.get_discrete_int(self.action_list[state])]= value[state]
        valueMap = copyValueMap
        

        # increment episode
        self.curr_step += 1


        return self.states_cond, reward, self.done, {'action_list': self.action_list, 'episode': self.curr_step,
                                                    'change': deltaState}
    
    def get_reward(self):
        for i in range(self.states):
            cpu = 1
            reward = [0]*self.states
            reward[i] = self.rr[i]*math.exp(-cpu*self.received[i])
        reward = sum(reward)
        return reward 


   
    def close(self):
        pass 

locations = 5
episodes = 50
total_drugs_qty = 10000
#create an instance of the class 
env = StatesEnv(locations, episodes, total_drugs_qty)

action = [16.66, 16.66, 16.66, 16.66, 16.66, 16.66]
delta = []


obs = env.reset()
for ep in range(episodes):
    obs, reward, done, info = env.step(action)
    delta = np.append(delta, info)
    print("Episode {}".format(ep+1))        
    print("obs=", obs, "reward=", reward, "done=", done)        
    
#print(delta)
#     plt.figure(figsize=(20, 10))
#     plt.rcParams.update({'figure.max_open_warning': 0})
#     if done: 
#             print("Done:)")
#             break
# for l in range(locations):    
#     plt.subplot(locations, 1, l+1)
#     plt.plot([5,10,15,20,25,30,35,40,45,50], delta[l], 'b-')
# plt.show()


env.close()




Distribution set:  [4, 6, 9, 15, 8, 41.34]
reserved quantity:  4134.000000000001
recovery rate due to drug:  [0.001474948008082715, 0.004882971450893584, 0.010896411448496296, 0.023580456517638182, 0.0009476023292065252]
recovery rate(before):  [0.00421114 0.00985275 0.01810131 0.02697842 0.0005322 ]
Episode 1
obs= [[1.35598000e+05 1.05050000e+04 1.47494801e-03 1.12970000e+04]
 [6.14380000e+04 1.37900000e+04 4.88297145e-03 3.08000000e+02]
 [4.12980000e+04 1.11390000e+04 1.08964114e-02 2.01000000e+02]
 [3.18060000e+04 7.91800000e+03 2.35804565e-02 2.36000000e+02]
 [4.22118000e+05 1.50566000e+05 9.47602329e-04 3.65000000e+02]] reward= 0.0 done= False
Distribution set:  [1, 1, 1, 1, 1, 53.66]
reserved quantity:  5366.0
recovery rate due to drug:  [0.00036873700202067877, 0.0008138285751489306, 0.001210712383166255, 0.0015720304345092121, 0.00011845029115081565]
recovery rate(before):  [0.00147495 0.00488297 0.01089641 0.02358046 0.0009476 ]
Episode 2
obs= [[1.35598000e+05 1.04550000e+04 3