83 changes: 73 additions & 10 deletions src/irlwpytorch/MountainCar.py
@@ -1,9 +1,11 @@
import gym
import numpy as np
import matplotlib.pyplot as plt


class MountainCar:

def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamma):
def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer):
if animation:
self.env = gym.make('MountainCar-v0', render_mode="human")
else:
@@ -13,6 +15,8 @@ def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamm
self.q_table = None
self.q_learning_rate = q_learning_rate
self.gamma = gamma
self.n_states = n_states
self.trainer = trainer

def __enter__(self):
return self
@@ -42,15 +46,6 @@ def idx_demo(self, one_feature):

return demonstrations

def idx_state(self, state):
env_low = self.env.observation_space.low
env_high = self.env.observation_space.high
env_distance = (env_high - env_low) / self.one_feature
position_idx = int((state[0] - env_low[0]) / env_distance[0])
velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
state_idx = position_idx + velocity_idx * self.one_feature
return state_idx

def idx_to_state(self, state):
""" Convert pos and vel about mounting car environment to the integer value"""
env_low = self.env.observation_space.low
@@ -74,3 +69,71 @@ def env_reset(self):

def env_step(self, action):
return self.env.step(action)

def train(self, theta_learning_rate):
demonstrations = self.idx_demo(self.one_feature)

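# Expert feature expectations, estimated by the trainer from the recorded demonstrations.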
expert = self.trainer.expert_feature_expectations(demonstrations)
learner_feature_expectations = np.zeros(self.n_states)
episodes, scores = [], []

for episode in range(30000):
state = self.env_reset()
score = 0

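# At episode 10000, then every 5000 episodes, refit the reward weights theta from the gap between expert and learner feature expectations.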
if episode == 10000 or (episode > 10000 and episode % 5000 == 0):
learner = learner_feature_expectations / episode
self.trainer.maxent_irl(expert, learner, theta_learning_rate)

state = state[0]
while True:
state_idx = self.idx_to_state(state)
action = np.argmax(self.q_table[state_idx])
next_state, reward, terminated, truncated, _ = self.env_step(action)
done = terminated or truncated

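# The Q-update is driven by the learned IRL reward instead of the environment reward.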
irl_reward = self.trainer.get_reward(self.n_states, state_idx)
next_state_idx = self.idx_to_state(next_state)
self.update_q_table(state_idx, action, irl_reward, next_state_idx)

learner_feature_expectations += self.trainer.get_feature_matrix()[int(state_idx)]

score += reward
state = next_state
if done:
scores.append(score)
episodes.append(episode)
break

if episode % 100 == 0:
score_avg = np.mean(scores)
print('{} episode score is {:.2f}'.format(episode, score_avg))
plt.plot(episodes, scores, 'b')
plt.savefig("./learning_curves/maxent_30000.png")
np.save("./results/maxent_30000_table", arr=self.q_table)

def test(self):
episodes, scores = [], []

for episode in range(10):
state = self.env_reset()
score = 0

state = state[0]
while True:
self.env_render()
state_idx = self.idx_to_state(state)
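# Greedy action from the learned Q-table; no exploration at test time.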
action = np.argmax(self.q_table[state_idx])
next_state, reward, terminated, truncated, _ = self.env_step(action)
done = terminated or truncated

score += reward
state = next_state

if done:
scores.append(score)
episodes.append(episode)
plt.plot(episodes, scores, 'b')
plt.savefig("./learning_curves/maxent_test_30000.png")
break

print('{} episode score is {:.2f}'.format(episode, score))
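Note: train() and test() above call into the MaxEntropyIRL trainer, but MaxEntropyIRL.py is not part of this diff. The sketch below is a hypothetical shape of that class, reconstructed only from the call sites above (expert_feature_expectations, maxent_irl, get_reward, get_feature_matrix); the repository's actual implementation may differ, and the per-step triple unpacking of a demonstration is an assumption.

import numpy as np


class MaxEntropyIRL:
    def __init__(self, feature_matrix, theta):
        self.feature_matrix = feature_matrix  # (400, 400) identity: one-hot state features
        self.theta = theta  # (400,) learnable reward weights

    def get_feature_matrix(self):
        return self.feature_matrix

    def get_reward(self, n_states, state_idx):
        # Linear reward r(s) = phi(s) . theta; with one-hot features this
        # reduces to theta[state_idx].
        irl_rewards = self.feature_matrix.dot(self.theta)
        return irl_rewards[int(state_idx)]

    def expert_feature_expectations(self, demonstrations):
        # Sum the feature vectors of every state the expert visited,
        # averaged over the number of demonstrations.
        feature_expectations = np.zeros(self.feature_matrix.shape[0])
        for demonstration in demonstrations:
            for state_idx, _, _ in demonstration:  # assumed (state, action, ...) layout
                feature_expectations += self.feature_matrix[int(state_idx)]
        return feature_expectations / len(demonstrations)

    def maxent_irl(self, expert, learner, learning_rate):
        # MaxEnt gradient step: the gradient is the gap between expert
        # and learner feature expectations.
        gradient = expert - learner
        self.theta += learning_rate * gradient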
Binary file modified src/irlwpytorch/learning_curves/maxent_30000.png
Binary file modified src/irlwpytorch/learning_curves/maxent_test_30000.png
119 changes: 14 additions & 105 deletions src/irlwpytorch/main.py
@@ -1,36 +1,12 @@
"""
This is a skeleton file that can serve as a starting point for a Python
console script. To run this script uncomment the following lines in the
``[options.entry_points]`` section in ``setup.cfg``::

console_scripts =
fibonacci = irlwpytorch.skeleton:run

Then run ``pip install .`` (or ``pip install -e .`` for editable mode)
which will install the command ``fibonacci`` inside your current environment.

Besides console scripts, the header (i.e. until ``_logger``...) of this file can
also be used as template for Python modules.

Note:
This file can be renamed depending on your needs or safely removed if not needed.

References:
- https://setuptools.pypa.io/en/latest/userguide/entry_point.html
- https://pip.pypa.io/en/stable/reference/pip_install
"""

import argparse
import gym
import matplotlib.pyplot as plt
import logging
import numpy as np
import sys

from .MountainCar import MountainCar
from .MaxEntropyIRL import MaxEntropyIRL
from MountainCar import MountainCar
from MaxEntropyIRL import MaxEntropyIRL

# from irlwpytorch import __version__
#from irlwpytorch import __version__

__author__ = "HokageM"
__copyright__ = "HokageM"
@@ -55,7 +31,7 @@ def parse_args(args):
parser.add_argument(
"--version",
action="version",
# version=f"IRLwPytorch {__version__}",
# version=f"IRLwPytorch {__version__}",
)
parser.add_argument('--training', action='store_true', help="Enables training of model.")
parser.add_argument('--testing', action='store_true',
@@ -92,103 +68,36 @@ def main(args):
n_states = 400  # 20 position bins x 20 velocity bins
n_actions = 3
one_feature = 20  # number of discretized bins per feature
feature_matrix = np.eye((n_states)) # (400, 400)
feature_matrix = np.eye(n_states) # (400, 400)

gamma = 0.99
q_learning_rate = 0.03
theta_learning_rate = 0.05

car = None
if args.render:
car = MountainCar(True, feature_matrix, one_feature, q_learning_rate, gamma)
else:
car = MountainCar(False, feature_matrix, one_feature, q_learning_rate, gamma)

theta = -(np.random.uniform(size=(n_states,)))
trainer = MaxEntropyIRL(feature_matrix, theta)

if args.render:
car = MountainCar(True, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer)
else:
car = MountainCar(False, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer)

if args.training:
q_table = np.zeros((n_states, n_actions)) # (400, 3)
q_table = np.zeros((n_states, n_actions))
car.set_q_table(q_table)

demonstrations = car.idx_demo(one_feature)

expert = trainer.expert_feature_expectations(demonstrations)
learner_feature_expectations = np.zeros(n_states)
episodes, scores = [], []

for episode in range(30000):
state = car.env_reset()
score = 0

if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
learner = learner_feature_expectations / episode
trainer.maxent_irl(expert, learner, theta_learning_rate)

state = state[0]
while True:
state_idx = car.idx_state(state)
action = np.argmax(q_table[state_idx])
next_state, reward, done, _, _ = car.env_step(action)

irl_reward = trainer.get_reward(n_states, state_idx)
next_state_idx = car.idx_state(next_state)
car.update_q_table(state_idx, action, irl_reward, next_state_idx)

learner_feature_expectations += trainer.get_feature_matrix()[int(state_idx)]

score += reward
state = next_state
if done:
scores.append(score)
episodes.append(episode)
break

if episode % 100 == 0:
score_avg = np.mean(scores)
print('{} episode score is {:.2f}'.format(episode, score_avg))
plt.plot(episodes, scores, 'b')
plt.savefig("./learning_curves/maxent_30000.png")
np.save("./results/maxent_30000_table", arr=q_table)
car.train(theta_learning_rate)

if args.testing:
q_table = np.load(file="results/maxent_q_table.npy") # (400, 3)
q_table = np.load(file="./results/maxent_q_table.npy") # (400, 3)
car.set_q_table(q_table)

episodes, scores = [], []

for episode in range(10):
state = car.env_reset()
score = 0

state = state[0]
while True:
car.env_render()
state_idx = car.idx_to_state(state)
action = np.argmax(q_table[state_idx])
next_state, reward, done, _, _ = car.env_step(action)

score += reward
state = next_state

if done:
scores.append(score)
episodes.append(episode)
plt.plot(episodes, scores, 'b')
plt.savefig("./learning_curves/maxent_test_30000.png")
break

if episode % 1 == 0:
print('{} episode score is {:.2f}'.format(episode, score))
car.test()

_logger.info("Script ends here")


def run():
"""Calls :func:`main` passing the CLI arguments extracted from :obj:`sys.argv`

This function can be used as entry point to create console scripts with setuptools.
"""
main(sys.argv[1:])


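For reference, a minimal usage sketch of the refactored entry point. It assumes the module is imported from src/irlwpytorch (matching the new absolute imports), that --render is defined in the unchanged parser code (its handler, args.render, is visible above), and that ./learning_curves/ and ./results/ already exist in the working directory; these are assumptions, not part of this diff.

from main import main

# Train head-less for 30000 episodes; writes ./learning_curves/maxent_30000.png
# and ./results/maxent_30000_table.npy relative to the working directory.
main(["--training"])

# Replay 10 greedy episodes with rendering, using the saved Q-table.
main(["--testing", "--render"])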
Binary file modified src/irlwpytorch/results/maxent_30000_table.npy
Binary file not shown.