In [1]:
#importing dependencies
import numpy as np
import random
import pandas as pd

## Initializing all global variables

In [2]:
number_of_agents = 3  #LT,MT,ST

In [3]:
number_of_epochs = 50
number_of_periods = 30 + 2

In [4]:
#List to store the global state at the end of each epoch
global_state_per_epoch = []

In [5]:
#action space contains of reducing or increasing the amount of tons CO2 consumed per capita
action_space = [
    -0.2, -0.16, -0.12, -0.08, -0.04, 0, 0.04, 0.08, 0.12, 0.16, 0.2
]
size_of_action_space = len(action_space)

In [6]:
cost_of_action = 5  #defining cost to reduce CO2 emissions per metric ton

In [7]:
#Q-tables
Q_LT = np.zeros((number_of_periods - 1, size_of_action_space))
Q_MT = np.zeros((number_of_periods - 1, size_of_action_space))
Q_ST = np.zeros((number_of_periods - 1, size_of_action_space))

#Q-tables for every epoch that stores all max values from
Q_LT_per_epoch = []
Q_MT_per_epoch = []
Q_ST_per_epoch = []

In [8]:
#defining best actions from Q-tables
Max_Q_LT = 0
Max_Q_MT = 0
Max_Q_ST = 0

In [9]:
#defining the weight factors of immediate rewards
LT_reward_factor = 0.4
MT_reward_factor = 0.5
ST_reward_factor = 0.6
cumulative_reward = 0  #initializing cumulative reward, which is 0 to start with

In [10]:
#creating a list to store the cumulative reward for each epoch
cumulative_reward_per_epoch = []

#creating a list to store the immediate rewards for each epoch
immediate_rewards_per_epoch = []

In [11]:
#epsilons are initialized within epoch.

#rate of decrement
LT_epsilon_decay = 0.95
MT_epsilon_decay = 0.9
ST_epsilon_decay = 0.85

#min value
LT_epsilon_min = 0.1
MT_epsilon_min = 0.06
ST_epsilon_min = 0.03

In [12]:
alpha = 0.1

In [13]:
#alpha <=> Learning rate (will be initialized within epoch)
alpha_min = 0.01  #min alpha
alpha_decay = 0.995  #rate of decrement

gamma = 0.7  #<=>reward discount

In [15]:
#run epoch loop
for epoch in range (1, number_of_epochs):
  
  #initializing variables which are reset for every new epoch
  global_state = 4.025 #1984 CO2 emissions (metric tons per capita)
  cumulative_reward = 0 #initializing cumulative reward, which is 0 to start with
  alpha = 0.1 #initializing the learning rate of the Q-values
  
  #initializing epsilons for each agent, the variable to balance exploration and exploitation, higher epsilon means more exploitation
  LT_epsilon = 0.9 
  MT_epsilon = 0.8 
  ST_epsilon = 0.7 
  
  #run period loop
  for period in range (0, number_of_periods-2):
    
    #initializing variables
    random_action = random.randrange(0,size_of_action_space)
    LT_action = action_space[random_action]
    random_action = random.randrange(0,size_of_action_space)
    MT_action = action_space[random_action]
    random_action = random.randrange(0,size_of_action_space)
    ST_action = action_space[random_action]
    
    #Immediate Reward functions
    LT_immediate_reward = -1 * LT_action * LT_reward_factor + cost_of_action * LT_action 
    MT_immediate_reward = -1 * MT_action * MT_reward_factor + cost_of_action * MT_action 
    ST_immediate_reward = -1 * ST_action * ST_reward_factor + cost_of_action * ST_action    
    
    #action iteration and Q-Table updates with exploration vs. exploitation
    #LT
    if np.random.rand() <= LT_epsilon:
      random_action = random.randrange(0,size_of_action_space)
      LT_action = action_space[random_action]
      Q_LT[period, random_action] = round((1-alpha) * Q_LT[period, random_action] + alpha * (LT_immediate_reward + gamma * np.amax(Q_LT[period + 1, :])), 2)
    else:
      LT_action = action_space[Max_Q_LT]
      Q_LT[period, Max_Q_LT] = round((1-alpha) * Q_LT[period, Max_Q_LT] + alpha * (LT_immediate_reward + gamma * np.amax(Q_LT[period + 1, :])), 2)
    
    #MT
    if np.random.rand() <= MT_epsilon:
      random_action = random.randrange(0,size_of_action_space)
      MT_action = action_space[random_action]
      Q_MT[period, random_action] = round((1-alpha) * Q_MT[period, random_action] + alpha * (MT_immediate_reward + gamma * np.amax(Q_MT[period + 1, :])), 2)
    else:
      MT_action = action_space[Max_Q_MT]
      Q_MT[period, Max_Q_MT] = round((1-alpha) * Q_MT[period, Max_Q_MT] + alpha * (LT_immediate_reward + gamma * np.amax(Q_MT[period + 1, :])), 2)
    
    #ST
    if np.random.rand() <= ST_epsilon:
      random_action = random.randrange(0,size_of_action_space)
      ST_action = action_space[random_action]
      Q_ST[period, random_action] = round((1-alpha) * Q_ST[period, random_action] + alpha * (ST_immediate_reward + gamma * np.amax(Q_ST[period + 1, :])), 2)
    else:
      ST_action = action_space[Max_Q_ST]
      Q_ST[period, Max_Q_ST] = round((1-alpha) * Q_ST[period, Max_Q_ST] + alpha * (ST_immediate_reward + gamma * np.amax(Q_ST[period + 1, :])), 2)
    
    #to be maximized: += immediate rewards of both agents + the negative of the actions that each agent takes, because the actions influence the global state, which shall be minimized
    cumulative_reward += LT_immediate_reward + MT_immediate_reward + ST_immediate_reward - 5*(action_space[np.argmax(Q_LT[period, :])]) - 5*(action_space[np.argmax(Q_MT[period, :])]) - 5*(action_space[np.argmax(Q_ST[period, :])])
    
    #decaying learning rate and agent's epsilon values
    alpha = alpha*alpha_decay if (alpha > alpha_min) else alpha
    LT_epsilon = LT_epsilon*LT_epsilon_decay if (LT_epsilon > LT_epsilon_min) else LT_epsilon
    MT_epsilon = MT_epsilon*MT_epsilon_decay if (MT_epsilon > MT_epsilon_min) else MT_epsilon
    ST_epsilon = ST_epsilon*ST_epsilon_decay if (ST_epsilon > ST_epsilon_min) else ST_epsilon
    
    #udpating global state and period counter
    global_state += (LT_action + MT_action + ST_action) / number_of_agents

#for each epoch, store the policies of each agent
  Q_LT_per_epoch.append(np.argmax(Q_LT, axis=1).tolist())
  Q_MT_per_epoch.append(np.argmax(Q_MT, axis=1).tolist())
  Q_ST_per_epoch.append(np.argmax(Q_ST, axis=1).tolist())
  
  #store the cumulative and immediate rewards and the global states of each epoch
  cumulative_reward_per_epoch.append(cumulative_reward)
  immediate_rewards_per_epoch.append((LT_immediate_reward, MT_immediate_reward, ST_immediate_reward))
  global_state_per_epoch.append(global_state)

In [18]:
#evaluating the trained model
print('\n')
print(
    f"Best Epoch based on Cumulative Reward: {np.argmax(cumulative_reward_per_epoch)}"
)
print(
    f"Highest Cumulative Reward: {round(np.amax(cumulative_reward_per_epoch), 2)}"
)
print(f"Best Epoch based on Global State: {np.argmin(global_state_per_epoch)}")
print(f"Lowest Global State: {round(np.min(global_state_per_epoch), 2)}")
print('\n')
print(
    f"Best Epoch based on LT's Immediate Reward: {np.argmax(immediate_rewards_per_epoch[:, 0])}"
)
print(
    f"Highest Immediate Reward for LT: {round(np.max(immediate_rewards_per_epoch[:, 0]), 2)}"
)
print(
    f"Best Epoch based on MT's Immediate Reward: {np.argmax(immediate_rewards_per_epoch[:, 1])}"
)
print(
    f"Highest Immediate Reward for MT: {round(np.max(immediate_rewards_per_epoch[:, 1]), 2)}"
)
print(
    f"Best Epoch based on ST's Immediate Reward: {np.argmax(immediate_rewards_per_epoch[:, 2])}"
)
print(
    f"Highest Immediate Reward for ST: {round(np.max(immediate_rewards_per_epoch[:, 2]), 2)}"
)

print('\n')



Best Epoch based on Cumulative Reward: 0
Highest Cumulative Reward: 85.82
Best Epoch based on Global State: 10
Lowest Global State: -0.92




TypeError: list indices must be integers or slices, not tuple

In [17]:
#build utilitarian, selfish and greedy policies of each agent
Q_LT_Best = Q_LT_per_epoch[np.argmax(cumulative_reward_per_epoch[:, 1]).tolist()]
Q_MT_Best = Q_MT_per_epoch[np.argmax(cumulative_reward_per_epoch[:, 1]).tolist()]
Q_ST_Best = Q_ST_per_epoch[np.argmax(cumulative_reward_per_epoch[:, 1]).tolist()]

Q_LT_Immediate_Best = Q_LT_per_epoch[np.argmax(immediate_rewards_per_epoch[:, 1]).tolist()]
Q_MT_Immediate_Best = Q_MT_per_epoch[np.argmax(immediate_rewards_per_epoch[:, 1]).tolist()]
Q_ST_Immediate_Best = Q_ST_per_epoch[np.argmax(immediate_rewards_per_epoch[:, 2]).tolist()]
                                                       
LT_Strategy = [action_space[i] for i in Q_LT_Best.astype(int)[1:]]
MT_Strategy = [action_space[i] for i in Q_MT_Best.astype(int)[1:]]
ST_Strategy = [action_space[i] for i in Q_ST_Best.astype(int)[1:]]

LT_Greedy_Strategy = [action_space[i] for i in Q_LT_Immediate_Best.astype(int)[1:]]
MT_Greedy_Strategy = [action_space[i] for i in Q_MT_Immediate_Best.astype(int)[1:]]
ST_Greedy_Strategy = [action_space[i] for i in Q_ST_Immediate_Best.astype(int)[1:]]

LT_Policy = [action_space[i] for i in np.argmax(Q_LT, axis=1)]
MT_Policy = [action_space[i] for i in np.argmax(Q_MT, axis=1)]
ST_Policy = [action_space[i] for i in np.argmax(Q_ST, axis=1)]

print(f"LT's Strategy to achieve Highest Cumulative Reward: \n {LT_Strategy}")
print(f"LT's Strategy to achieve Highest Immediate Reward: \n {LT_Greedy_Strategy}")
print(f"Selfish Policy of LT, based on LT's Final Q-Table: \n {LT_Policy}")
print('\n')

print(f"MT's Strategy to achieve Highest Cumulative Reward: \n {MT_Strategy}")
print(f"MT's Strategy to achieve Highest Immediate Reward: \n {MT_Greedy_Strategy}")
print(f"Selfish Policy of MT, based on MT's Final Q-Table: \n {MT_Policy}")
print('\n')

print(f"ST's Strategy to achieve Highest Cumulative Reward: \n {ST_Strategy}")
print(f"ST's Strategy to achieve Highest Immediate Reward: \n {ST_Greedy_Strategy}")
print(f"Selfish Policy of ST, based on ST's Final Q-Table: \n {ST_Policy}")
print('\n')

TypeError: list indices must be integers or slices, not tuple

In [21]:
print(f"Final Q-Table of LT: \n {action_count}")  #Q-Tables are printed...
print(Q_rowcount(Q_LT[:number_of_periods - 2]))
print('\n')

print(f"Final Q-Table of MT: \n {action_count}")  #Q-Tables are printed...
print(Q_rowcount(Q_MT[:number_of_periods - 2]))
print('\n')

print(
    f"Final Q-Table of ST: \n {action_count}"
)  #...Based on the "rewards_per_epoch" Table, the best Q-Tables are identified as policies
print(Q_rowcount(Q_ST[:number_of_periods - 2]))

NameError: name 'action_count' is not defined

In [22]:
#Under utilary Policies
#initializing variables which are reset for every new epoch
global_state = 4.97  #current CO2 emissions are at 4.97 metric tons per capita world-wide. Source: World Bank
cumulative_reward_selfish = 0  #initializing cumulative reward, which is 0 to start with
alpha = 0.1  #initializing the learning rate of the Q-values
Q_LT = np.zeros((number_of_periods - 1, size_of_action_space))
Q_MT = np.zeros((number_of_periods - 1, size_of_action_space))
Q_ST = np.zeros((number_of_periods - 1, size_of_action_space))

#run period loop
for period in range(0, number_of_periods - 2):

    #actions based on Policies
    LT_action = ST_Strategy[period]
    MT_action = MT_Strategy[period]
    ST_action = ST_Strategy[period]

    #Immediate Reward functions
    LT_immediate_reward = -1 * LT_action * LT_reward_factor + cost_of_action * LT_action  #defining immediate reward function of LT per period
    MT_immediate_reward = -1 * LT_action * MT_reward_factor + cost_of_action * MT_action  #defining immediate reward function of LT per period
    ST_immediate_reward = -1 * ST_action * ST_reward_factor + cost_of_action * ST_action  #defining immediate reward function of ST per period

    #to be maximized: += immediate rewards of both agents + the negative of the actions that each agent takes, because the actions influence the global state, which shall be minimized
    cumulative_reward_selfish += LT_immediate_reward + MT_immediate_reward + ST_immediate_reward - 5 * (
        action_space[np.argmax(Q_LT[period, :])]
    ) - 5 * (action_space[np.argmax(
        Q_MT[period, :])]) - 5 * (action_space[np.argmax(Q_ST[period, :])])

    global_state += (LT_action + MT_action + ST_action) / number_of_agents
    period += 1

print('\n')
print(
    f"Cumulative Reward under Selfish Policies: {round(cumulative_reward_selfish, 2)}"
)
print(f"Global State after using Selfish Policies: {round(global_state, 2)}")

Selfish_Reward_Loss = round(
    100 *
    (cumulative_reward_selfish - np.max(cumulative_reward_per_epoch[:, 1])) /
    np.max(cumulative_reward_per_epoch[:, 1]), 2)
CO2_Selfish = round(
    100 * ((global_state - np.min(global_state_per_epoch[:, 1])) /
           np.min(global_state_per_epoch[:, 1])), 2)

print('\n')
print(
    f"Percentage of Lower Cumulative Reward comparing Selfish Policies to Best Epoch: {Selfish_Reward_Loss}%"
)
print(
    f"Percentage of Higher Global State comparing Selfish Policies to Best Epoch: {CO2_Selfish}%"
)



Cumulative Reward under Selfish Policies: 23.22
Global State after using Selfish Policies: 0.01


Percentage of Lower Cumulative Reward comparing Selfish Policies to Best Epoch: -66.93%
Percentage of Higher Global State comparing Selfish Policies to Best Epoch: -100.97%


In [23]:
#Under Selfish Policies
#initializing variables which are reset for every new epoch
global_state = 4.97  #current CO2 emissions are at 4.97 metric tons per capita world-wide. Source: World Bank
cumulative_reward_selfish = 0  #initializing cumulative reward, which is 0 to start with
alpha = 0.1  #initializing the learning rate of the Q-values
Q_LT = np.zeros((number_of_periods - 1, size_of_action_space))
Q_MT = np.zeros((number_of_periods - 1, size_of_action_space))
Q_ST = np.zeros((number_of_periods - 1, size_of_action_space))

#run period loop
for period in range(0, number_of_periods - 2):

    #actions based on Policies
    LT_action = LT_Policy[period]
    MT_action = MT_Policy[period]
    ST_action = ST_Policy[period]

    #Immediate Reward functions
    LT_immediate_reward = -1 * LT_action * LT_reward_factor + cost_of_action * LT_action  #defining immediate reward function of LT per period
    MT_immediate_reward = -1 * LT_action * MT_reward_factor + cost_of_action * MT_action  #defining immediate reward function of LT per period
    ST_immediate_reward = -1 * ST_action * ST_reward_factor + cost_of_action * ST_action  #defining immediate reward function of ST per period

    #to be maximized: += immediate rewards of both agents + the negative of the actions that each agent takes, because the actions influence the global state, which shall be minimized
    cumulative_reward_selfish += LT_immediate_reward + MT_immediate_reward + ST_immediate_reward - 5 * (
        action_space[np.argmax(Q_LT[period, :])]
    ) - 5 * (action_space[np.argmax(
        Q_MT[period, :])]) - 5 * (action_space[np.argmax(Q_ST[period, :])])

    global_state += (LT_action + MT_action + ST_action) / number_of_agents
    period += 1

print('\n')
print(
    f"Cumulative Reward under Selfish Policies: {round(cumulative_reward_selfish, 2)}"
)
print(f"Global State after using Selfish Policies: {round(global_state, 2)}")

Selfish_Reward_Loss = round(
    100 *
    (cumulative_reward_selfish - np.max(cumulative_reward_per_epoch[:, 1])) /
    np.max(cumulative_reward_per_epoch[:, 1]), 2)
CO2_Selfish = round(
    100 * ((global_state - np.min(global_state_per_epoch[:, 1])) /
           np.min(global_state_per_epoch[:, 1])), 2)

print('\n')
print(
    f"Percentage of Lower Cumulative Reward comparing Selfish Policies to Best Epoch: {Selfish_Reward_Loss}%"
)
print(
    f"Percentage of Higher Global State comparing Selfish Policies to Best Epoch: {CO2_Selfish}%"
)



Cumulative Reward under Selfish Policies: 61.9
Global State after using Selfish Policies: 2.9


Percentage of Lower Cumulative Reward comparing Selfish Policies to Best Epoch: -11.84%
Percentage of Higher Global State comparing Selfish Policies to Best Epoch: -382.33%


In [25]:
#Under Greedy Policies
#initializing variables which are reset for every new epoch
global_state = 4.97  #current CO2 emissions are at 4.97 metric tons per capita world-wide. Source: World Bank
cumulative_reward_greedy = 0  #initializing cumulative reward, which is 0 to start with
alpha = 0.1  #initializing the learning rate of the Q-values

#run period loop
for period in range(0, number_of_periods - 2):

    #actions based on Policies
    LT_action = LT_Greedy_Strategy[period]
    MT_action = MT_Greedy_Strategy[period]
    ST_action = ST_Greedy_Strategy[period]

    #Immediate Reward functions
    LT_immediate_reward = -1 * LT_action * LT_reward_factor + cost_of_action * LT_action
    MT_immediate_reward = -1 * LT_action * MT_reward_factor + cost_of_action * MT_action
    ST_immediate_reward = -1 * ST_action * ST_reward_factor + cost_of_action * ST_action

    #to be maximized: += immediate rewards of both agents + the negative of the actions that each agent takes, because the actions influence the global state, which shall be minimized
    cumulative_reward_greedy += LT_immediate_reward + MT_immediate_reward + ST_immediate_reward - 5 * (
        action_space[np.argmax(Q_LT[period, :])]
    ) - 5 * (action_space[np.argmax(
        Q_MT[period, :])]) - 5 * (action_space[np.argmax(Q_ST[period, :])])

    global_state += (LT_action + MT_action + ST_action) / number_of_agents
    period += 1

print('\n')
print(
    f"Cumulative Reward under Greedy Policies: {round(cumulative_reward_greedy, 2)}"
)
print(f"Global State after using Greedy Policies: {round(global_state, 2)}")

Greedy_Reward_Loss = round(
    100 *
    (cumulative_reward_greedy - np.max(cumulative_reward_per_epoch[:, 1])) /
    np.max(cumulative_reward_per_epoch[:, 1]), 2)
CO2_Greedy = round(
    100 * ((global_state - np.min(global_state_per_epoch[:, 1])) /
           np.min(global_state_per_epoch[:, 1])), 2)

print('\n')
print(
    f"Percentage of Lower Cumulative Reward comparing Greedy Policies to Best Epoch: {Greedy_Reward_Loss}%"
)
print(
    f"Percentage of Higher Global State comparing Greedy Policies to Best Epoch: {CO2_Greedy}%"
)



Cumulative Reward under Greedy Policies: 30.44
Global State after using Greedy Policies: 0.6


Percentage of Lower Cumulative Reward comparing Greedy Policies to Best Epoch: -61.68%
Percentage of Higher Global State comparing Greedy Policies to Best Epoch: -155.85%


In [62]:
#showing period numbers in first column of Q-table
def Q_rowcount(Q_Table):
    subcount = 1
    for row in Q_Table:
        count = f"p{subcount}"
        print(count, row)
        subcount += 1