diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 21f9e9e..c423c9c 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.1.0
+current_version = 0.1.1
 commit = True
 tag = True
 
@@ -7,11 +7,6 @@ tag = True
 search = __version__ = "{current_version}"
 replace = __version__ = "{new_version}"
 
-
 [bumpversion:file:pyproject.toml]
 search = version="{current_version}"
 replace = version="{new_version}"
-
-[bumpversion:file:setup.py]
-search = version="{current_version}"
-replace = version="{new_version}"
\ No newline at end of file
diff --git a/SevenWondersEnv/SevenWonEnv/envs/gameLog.txt b/SevenWondersEnv/SevenWonEnv/envs/gameLog.txt
deleted file mode 100644
index ebdc8e9..0000000
--- a/SevenWondersEnv/SevenWonEnv/envs/gameLog.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-SETUP COMPLETE
-Player 1 with wonders Halicarnassus
-Player 2 with wonders Alexandria
-Player 3 with wonders Babylon
-Player 4 with wonders Olympia
-PLAYER 1 play timber yard
-PLAYER 2 play workshop
-PLAYER 3 play excavation
-PLAYER 4 play step 1
-PLAYER 1 resource wood 0 clay 0 ore 0 stone 0 papyrus 0 glass 0 loom 1 compass 0 wheel 0 tablet 0 shield 0
-PLAYER 2 resource wood 0 clay 0 ore 0 stone 0 papyrus 0 glass 1 loom 0 compass 0 wheel 1 tablet 0 shield 0
-PLAYER 3 resource wood 0 clay 1 ore 0 stone 0 papyrus 0 glass 0 loom 0 compass 0 wheel 0 tablet 0 shield 0
-PLAYER 4 resource wood 1 clay 0 ore 0 stone 0 papyrus 0 glass 0 loom 0 compass 0 wheel 0 tablet 0 shield 0
-PLAYER 1 play clay pit
-PLAYER 2 play stone pit
-PLAYER 3 play ore vein
-PLAYER 4 play step 2
-PLAYER 1 resource wood 0 clay 0 ore 0 stone 0 papyrus 0 glass 0 loom 1 compass 0 wheel 0 tablet 0 shield 0
-PLAYER 2 resource wood 0 clay 0 ore 0 stone 1 papyrus 0 glass 1 loom 0 compass 0 wheel 1 tablet 0 shield 0
-PLAYER 3 resource wood 0 clay 1 ore 1 stone 0 papyrus 0 glass 0 loom 0 compass 0 wheel 0 tablet 0 shield 0
-PLAYER 4 resource wood 1 clay 0 ore 0 stone 0 papyrus 0 glass 0 loom 0 compass 0 wheel 0 tablet 0 shield 0
-PLAYER 1 play apothecary
-PLAYER 2 play ore vein
-PLAYER 3 play lumber yard
-PLAYER 4 play loom
-PLAYER 1 resource wood 0 clay 0 ore 0 stone 0 papyrus 0 glass 0 loom 1 compass 1 wheel 0 tablet 0 shield 0
-PLAYER 2 resource wood 0 clay 0 ore 1 stone 1 papyrus 0 glass 1 loom 0 compass 0 wheel 1 tablet 0 shield 0
-PLAYER 3 resource wood 1 clay 1 ore 1 stone 0 papyrus 0 glass 0 loom 0 compass 0 wheel 0 tablet 0 shield 0
-PLAYER 4 resource wood 1 clay 0 ore 0 stone 0 papyrus 0 glass 0 loom 1 compass 0 wheel 0 tablet 0 shield 0
-PLAYER 1 play step 1
-PLAYER 2 play guard tower
-PLAYER 3 play glassworks
-PLAYER 4 play clay pool
-PLAYER 1 resource wood 0 clay 0 ore 0 stone 0 papyrus 0 glass 0 loom 1 compass 1 wheel 0 tablet 0 shield 0
-PLAYER 2 resource wood 0 clay 0 ore 1 stone 1 papyrus 0 glass 1 loom 0 compass 0 wheel 1 tablet 0 shield 1
-PLAYER 3 resource wood 1 clay 1 ore 1 stone 0 papyrus 0 glass 1 loom 0 compass 0 wheel 0 tablet 0 shield 0
-PLAYER 4 resource wood 1 clay 1 ore 0 stone 0 papyrus 0 glass 0 loom 1 compass 0 wheel 0 tablet 0 shield 0
diff --git a/SevenWondersEnv/SevenWonEnv/envs/main.py b/SevenWondersEnv/SevenWonEnv/envs/main.py
deleted file mode 100644
index 9386df7..0000000
--- a/SevenWondersEnv/SevenWonEnv/envs/main.py
+++ /dev/null
@@ -1,259 +0,0 @@
-import gymnasium as gym
-import os
-import math
-import random
-import numpy as np
-from itertools import count
-import SevenWonEnv
-from SevenWonEnv.envs.mainGameEnv import Personality
-import torch
-import torch.nn as nn
-import torch.optim as optim
-import torch.nn.functional as F
-from collections import namedtuple, deque
-
-import matplotlib
-import matplotlib.pyplot as plt
-
-from model import DQNModel
-
-Transition = namedtuple("Transition", ("state", "action", "next_state", "reward"))
-TransitionWithoutReward = namedtuple("TransitionWithoutReward", ("state", "action", "next_state"))
-BATCH_SIZE = 128
-GAMMA = 0.999
-# Decaying epsilon greedy.
-EpsGreed_START = 0.9
-EpsGreed_END = 0.05
-EpsGreed_DECAY = 2500
-TARGET_UPDATE = 10
-DISCOUNTED_RATE = 0.95
-steps = 0
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-
-class memoryNoReward(object):
-    def __init__(self, capacity):
-        self.memory = deque([], maxlen=capacity)
-
-    def push(self, *args):
-        """Save a transition"""
-        self.memory.append(TransitionWithoutReward(*args))
-
-    def sample(self, batch_size):
-        return random.sample(self.memory, batch_size)
-
-    def __len__(self):
-        return len(self.memory)
-
-
-class ReplayMemory(object):
-    def __init__(self, capacity):
-        self.memory = deque([], maxlen=capacity)
-
-    def push(self, *args):
-        """Save a transition"""
-        self.memory.append(Transition(*args))
-
-    def sample(self, batch_size):
-        return random.sample(self.memory, batch_size)
-
-    def __len__(self):
-        return len(self.memory)
-
-
-class DQN(nn.Module):
-    def __init__(self, obs, act):
-        super(DQN, self).__init__()
-        self.classifier = nn.Sequential(
-            nn.Linear(obs, 140),
-            nn.LayerNorm(140),
-            nn.Linear(140, 200),
-            nn.LayerNorm(200),
-            nn.Linear(200, 300),
-        )
-
-    def forward(self, x):
-        x = x.to(device)
-        x = x.view(x.size(0), -1)
-        x = self.classifier(x)
-        return x
-
-
-if __name__ == "__main__":
-    print(torch.cuda.is_available())
-    for env in gym.registry:
-        print(env)
-    env = gym.make("SevenWonderEnv", player=4)
-    # Add Players
-    personalityList = []
-    personalityList.append(Personality.DQNAI)
-    for i in range(1, 4):
-        personalityList.append(Personality.RandomAI)
-    env.setPersonality(personalityList)
-
-    # set up matplotlib
-    is_ipython = "inline" in matplotlib.get_backend()
-    if is_ipython:
-        from IPython import display
-
-    plt.ion()
-    n_actions = env.action_space.n
-    n_observationsVec = env.observation_space.nvec
-    n_observationsLen = len(n_observationsVec)
-    poliNet = DQN(n_observationsLen, n_actions).to(device)
-    tarNet = DQN(n_observationsLen, n_actions).to(device)
-    maxNumber = 0
-    # for file in os.listdir("ModelDict"):
-    #     if file.endswith(".pt") and file.startswith("poli"):
-    #         number = int(file[7:-3])
-    #         if number > maxNumber:
-    #             maxNumber = number
-    # if maxNumber > 0:
-    #     poliNet.load_state_dict(torch.load(os.path.join("ModelDict", "poliNet" + str(maxNumber) + ".pt")))
-    #     tarNet.load_state_dict(torch.load(os.path.join("ModelDict", "tarNet" + str(maxNumber) + ".pt")))
-    prevEp = maxNumber
-    poliNet.train()
-    tarNet.load_state_dict(poliNet.state_dict())
-    tarNet.train()
-    optimizer = optim.RMSprop(poliNet.parameters())
-    memory = ReplayMemory(500000)
-    steps = 0
-
-    def select_action(state, possibleAction):
-        global steps
-        sample = random.random()
-        EpsGreed_threshold = EpsGreed_END + (EpsGreed_START - EpsGreed_END) * math.exp(-1.0 * steps / EpsGreed_DECAY)
-        steps += 1
-        if sample > EpsGreed_threshold:
-            with torch.no_grad():
-                output = poliNet(state)
-                output -= output.min(1, keepdim=True)[0]
-                output /= output.max(1, keepdim=True)[0]
-                mask = torch.zeros_like(output)
-                for action in possibleAction:
-                    mask[0][action] = 1
-                output *= mask
-                # print(output)
-                return output.max(1)[1].view(1, 1)
-        else:
-            return torch.tensor([[random.choice(possibleAction)]], device=device, dtype=torch.long)
-
-    episode_reward = []
-    mean_reward = []
-
-    def plotReward():
-        plt.figure(2)
-        plt.clf()
-        rewards_t = torch.tensor(episode_reward, dtype=torch.float)
-        plt.title("Episode {}".format(len(episode_reward) + prevEp))
-        plt.xlabel("Episode")
-        plt.ylabel("Reward")
-        xRange = range(prevEp, len(rewards_t) + prevEp)
-        plt.plot(xRange, rewards_t.numpy())
-        # Take 100 episode averages and plot them too
-        if len(rewards_t) >= 100:
-            means = rewards_t.unfold(0, 100, 1).mean(1).view(-1)
-            means = torch.cat((torch.zeros(99), means))
-            plt.plot(xRange, means.numpy())
-        if len(episode_reward) % 100 == 0:
-            graphPath = os.path.join("Graph", str(len(episode_reward)) + ".png")
-            plt.savefig(graphPath)
-        plt.pause(0.01)  # pause a bit so that plots are updated
-
-        if is_ipython:
-            display.clear_output(wait=True)
-            display.display(plt.gcf())
-
-    def optimize_model():
-        if len(memory) < BATCH_SIZE:
-            return
-        transitions = memory.sample(BATCH_SIZE)
-        batch = Transition(*zip(*transitions))
-        non_final_mask = torch.tensor(
-            tuple(map(lambda s: s is not None, batch.next_state)),
-            device=device,
-            dtype=torch.bool,
-        )
-        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
-        state_batch = torch.cat(batch.state)
-        action_batch = torch.cat(batch.action)
-        reward_batch = torch.cat(batch.reward)
-        state_action_values = poliNet(state_batch).gather(1, action_batch)
-        next_state_values = torch.zeros(BATCH_SIZE, device=device)
-        next_state_values[non_final_mask] = tarNet(non_final_next_states).max(1)[0].detach()
-        # Compute the expected Q values
-        expected_state_action_values = (next_state_values * GAMMA) + reward_batch
-
-        # Compute Huber loss
-        criterion = nn.SmoothL1Loss()
-        loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))
-
-        # Optimize the model
-        optimizer.zero_grad()
-        loss.backward()
-        for param in poliNet.parameters():
-            param.grad.data.clamp_(-1, 1)
-        optimizer.step()
-
-    num_episodes = 20000
-    prevEp = 0
-    if not os.path.exists("ModelDict"):
-        os.makedirs("ModelDict")
-        os.makedirs("ModelDict/poliNet")
-        os.makedirs("ModelDict/tarNet")
-    for i_episode in range(prevEp, num_episodes):
-        if i_episode % 200 == 0:
-            torch.save(
-                poliNet.state_dict(),
-                os.path.join("ModelDict", "poliNet" + str(i_episode) + ".pt"),
-            )
-            torch.save(
-                tarNet.state_dict(),
-                os.path.join("ModelDict", "tarNet" + str(i_episode) + ".pt"),
-            )
-        print("Episode : {}".format(i_episode))
-        # Initialize the environment and state
-        NPState = env.reset()
-        state = torch.from_numpy(NPState.reshape(1, NPState.size).astype(np.float32))
-        subMemory = []
-        for t in count():
-            # Select and perform an action
-            possibleAction = env.legalAction(1)
-            # print(possibleAction)
-            action = select_action(state, possibleAction)
-            # print("STEP ACtION",action)
-            # print("PLAY" + str(action.item()))
-            newNPState, reward, done, info = env.step()
-            reward = torch.tensor([reward], device=device)
-            newState = torch.from_numpy(newNPState.reshape(1, newNPState.size).astype(np.float32))
-            # Store the transition in memory
-            subMemory.append([state, action, newState])
-            # print("T" + str(t) + " REWARD " + str(reward))
-            # Move to the next state
-            state = newState
-
-            # Perform one step of the optimization (on the policy network)
-            optimize_model()
-            if done:
-                subMemLen = len(subMemory)
-                for i in range(subMemLen):
-                    mem = subMemory[i]
-                    memory.push(
-                        mem[0],
-                        mem[1],
-                        mem[2],
-                        (DISCOUNTED_RATE ** (subMemLen - i)) * reward,
-                    )
-                # print("reward" + str(reward))
-                episode_reward.append(reward)
-                plotReward()
-                break
-        # Update the target network, copying all weights and biases in DQN
-        if i_episode % TARGET_UPDATE == 0:
-            tarNet.load_state_dict(poliNet.state_dict())
-    env.reset()
-    env.render()
-    env.close()
-    plt.ioff()
-    # plt.show(block = False)
-    # check_env(env)
diff --git a/SevenWondersEnv/SevenWonEnv/envs/mainGameEnv/loggers.txt b/SevenWondersEnv/SevenWonEnv/envs/mainGameEnv/loggers.txt
deleted file mode 100644
index e69de29..0000000
diff --git a/SevenWondersEnv/SevenWonEnv/envs/model.py b/SevenWondersEnv/SevenWonEnv/envs/model.py
deleted file mode 100644
index e488d54..0000000
--- a/SevenWondersEnv/SevenWonEnv/envs/model.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.optim as optim
-import torch.nn.functional as F
-from collections import namedtuple, deque
-
-
-class DQNModel:
-    def __init__(self):
-        self.Transition = namedtuple("Transition", ("state", "action", "next_state", "reward"))
-        self.TransitionWithoutReward = namedtuple("TransitionWithoutReward", ("state", "action", "next_state"))
-        self.BATCH_SIZE = 128
-        self.GAMMA = 0.999
-        # Decaying epsilon greedy.
-        self.EpsGreed_START = 0.9
-        self.EpsGreed_END = 0.05
-        self.EpsGreed_DECAY = 2500
-        self.TARGET_UPDATE = 10
-        self.DISCOUNTED_RATE = 0.95
-        self.steps = 0
-
-    class memoryNoReward(object):
-        def __init__(self, capacity):
-            self.memory = deque([], maxlen=capacity)
-
-        def push(self, *args):
-            """Save a transition"""
-            self.memory.append(TransitionWithoutReward(*args))
-
-        def sample(self, batch_size):
-            return random.sample(self.memory, batch_size)
-
-        def __len__(self):
-            return len(self.memory)
-
-    class ReplayMemory(object):
-        def __init__(self, capacity):
-            self.memory = deque([], maxlen=capacity)
-
-        def push(self, *args):
-            """Save a transition"""
-            self.memory.append(Transition(*args))
-
-        def sample(self, batch_size):
-            return random.sample(self.memory, batch_size)
-
-        def __len__(self):
-            return len(self.memory)
-
-    class DQN(nn.Module):
-        def __init__(self, obs, act):
-            super(DQN, self).__init__()
-            self.classifier = nn.Sequential(
-                nn.Linear(obs, 140),
-                nn.LayerNorm(140),
-                nn.Linear(140, 200),
-                nn.LayerNorm(200),
-                nn.Linear(200, 300),
-            )
-
-        def forward(self, x):
-            x = x.to(device)
-            x = x.view(x.size(0), -1)
-            x = self.classifier(x)
-            return x
diff --git a/SevenWondersEnv/_version.py b/SevenWondersEnv/_version.py
index 3dc1f76..485f44a 100644
--- a/SevenWondersEnv/_version.py
+++ b/SevenWondersEnv/_version.py
@@ -1 +1 @@
-__version__ = "0.1.0"
+__version__ = "0.1.1"
diff --git a/pyproject.toml b/pyproject.toml
index 8443e9d..02f6016 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@
 name = "7Wonder-RL-Lib"
 authors = [{name = "Phudis Dawieang", email = "pd2654@columbia.edu"}]
 description="Gymnasium Environment for the game Seven Wonders"
 readme = "README.md"
-version = "0.1.0"
+version="0.1.1"
 requires-python = ">=3.9"
 dependencies = ["gymnasium"]