2 changes: 2 additions & 0 deletions .gitignore
@@ -6,6 +6,8 @@ __pycache__/
# C extensions
*.so

.idea/

# Distribution / packaging
.Python
build/
25 changes: 24 additions & 1 deletion README.md
@@ -3,4 +3,27 @@ Inverse Reinforcement Learning Algorithm implementation with Pytorch.

The implementation is based on: https://github.com/reinforcement-learning-kr/lets-do-irl

Mountaincar experiment from: https://www.gymlibrary.dev/environments/classic_control/mountain_car/

# Installation

```commandline
cd IRLwPytorch
pip install .
```

# Usage

```commandline
usage: irl [-h] [--version] [--training] [--testing] [--render]

Implementation of IRL algorithms

options:
  -h, --help  show this help message and exit
  --version   show program's version number and exit
  --training  Enables training of model.
  --testing   Enables testing of previously created model.
  --render    Enables visualization of mountaincar.

```
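
As a quick illustration (not part of the diff, and assuming the entry point installs cleanly), a typical workflow would be to train a model first and then replay the saved Q-table with rendering enabled:

```commandline
irl --training
irl --testing --render
```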
4 changes: 2 additions & 2 deletions setup.cfg
@@ -76,8 +76,8 @@ testing =
# console_scripts =
#     script_name = irlwpytorch.module:function
# For example:
# console_scripts =
#     fibonacci = irlwpytorch.skeleton:run
console_scripts =
    irl = irlwpytorch.main:run
# And any other entry points, for example:
# pyscaffold.cli =
# awesome = pyscaffoldext.awesome.extension:AwesomeExtension
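The uncommented `console_scripts` entry above is what wires up the `irl` command shown in the README: after installation, setuptools generates an `irl` executable that calls `irlwpytorch.main:run`. A quick sanity check (illustrative commands, assuming the package installs without errors):

```commandline
pip install .
irl --help
```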
70 changes: 68 additions & 2 deletions src/irlwpytorch/MountainCar.py
@@ -1,10 +1,76 @@
import gym
import numpy as np

class MountainCar:

    def __init__(self):
        pass
    def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamma):
        if animation:
            self.env = gym.make('MountainCar-v0', render_mode="human")
        else:
            self.env = gym.make('MountainCar-v0')
        self.feature_matrix = feature_matrix
        self.one_feature = one_feature
        self.q_table = None
        self.q_learning_rate = q_learning_rate
        self.gamma = gamma

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def set_q_table(self, table):
        self.q_table = table

    def idx_demo(self, one_feature):
        env_low = self.env.observation_space.low
        env_high = self.env.observation_space.high
        env_distance = (env_high - env_low) / self.one_feature

        raw_demo = np.load(file="expert_demo/expert_demo.npy")
        demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))

        for x in range(len(raw_demo)):
            for y in range(len(raw_demo[0])):
                position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
                velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1])
                state_idx = position_idx + velocity_idx * one_feature

                demonstrations[x][y][0] = state_idx
                demonstrations[x][y][1] = raw_demo[x][y][2]

        return demonstrations

    def idx_state(self, state):
        env_low = self.env.observation_space.low
        env_high = self.env.observation_space.high
        env_distance = (env_high - env_low) / self.one_feature
        position_idx = int((state[0] - env_low[0]) / env_distance[0])
        velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
        state_idx = position_idx + velocity_idx * self.one_feature
        return state_idx

    def idx_to_state(self, state):
        """Convert the (position, velocity) observation of the mountain car environment into an integer state index."""
        env_low = self.env.observation_space.low
        env_high = self.env.observation_space.high
        env_distance = (env_high - env_low) / self.one_feature
        position_idx = int((state[0] - env_low[0]) / env_distance[0])
        velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
        state_idx = position_idx + velocity_idx * self.one_feature
        return state_idx

    def update_q_table(self, state, action, reward, next_state):
        q_1 = self.q_table[state][action]
        q_2 = reward + self.gamma * max(self.q_table[next_state])
        self.q_table[state][action] += self.q_learning_rate * (q_2 - q_1)

    def env_render(self):
        self.env.render()

    def env_reset(self):
        return self.env.reset()

    def env_step(self, action):
        return self.env.step(action)
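
For context, the new class is a thin wrapper around the gym environment plus the tabular state discretization used by the IRL trainer. A minimal usage sketch (not part of the diff; the import path, bin sizes, and hyperparameters are assumptions taken from main.py below) could look like this:

```python
import numpy as np

from irlwpytorch.MountainCar import MountainCar  # assumed absolute import after `pip install .`

# Hypothetical wiring that mirrors main.py: 20 position bins x 20 velocity bins = 400 states.
n_states = 400
n_actions = 3
one_feature = 20
feature_matrix = np.eye(n_states)

car = MountainCar(False, feature_matrix, one_feature, q_learning_rate=0.03, gamma=0.99)
car.set_q_table(np.zeros((n_states, n_actions)))

state = car.env_reset()[0]            # gym >= 0.26: reset() returns (observation, info)
state_idx = car.idx_state(state)      # discretize (position, velocity) into one of 400 indices
action = int(np.argmax(car.q_table[state_idx]))
next_state, reward, done, _, _ = car.env_step(action)

# One tabular Q-learning step with the environment reward; the real training loop
# substitutes the IRL reward obtained from the MaxEntropyIRL trainer instead.
car.update_q_table(state_idx, action, reward, car.idx_state(next_state))
```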
Binary file modified src/irlwpytorch/learning_curves/maxent_300.png
Binary file modified src/irlwpytorch/learning_curves/maxent_30000.png
138 changes: 43 additions & 95 deletions src/irlwpytorch/main.py
@@ -23,13 +23,12 @@
import argparse
import gym
import matplotlib.pyplot as plt
import numpy as np
import logging
import numpy as np
import sys

from MountainCar import MountainCar
from MaxEntropyIRL import MaxEntropyIRL
from .MountainCar import MountainCar
from .MaxEntropyIRL import MaxEntropyIRL

# from irlwpytorch import __version__

@@ -39,70 +38,9 @@

_logger = logging.getLogger(__name__)

n_states = 400 # position - 20, velocity - 20
n_actions = 3
one_feature = 20 # number of state per one feature
q_table = np.zeros((n_states, n_actions)) # (400, 3)
feature_matrix = np.eye((n_states)) # (400, 400)

gamma = 0.99
q_learning_rate = 0.03
theta_learning_rate = 0.05

np.random.seed(1)


def idx_demo(env, one_feature):
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_distance = (env_high - env_low) / one_feature

    raw_demo = np.load(file="expert_demo/expert_demo.npy")
    demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))

    for x in range(len(raw_demo)):
        for y in range(len(raw_demo[0])):
            position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
            velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1])
            state_idx = position_idx + velocity_idx * one_feature

            demonstrations[x][y][0] = state_idx
            demonstrations[x][y][1] = raw_demo[x][y][2]

    return demonstrations


def idx_state(env, state):
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_distance = (env_high - env_low) / one_feature
    position_idx = int((state[0] - env_low[0]) / env_distance[0])
    velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
    state_idx = position_idx + velocity_idx * one_feature
    return state_idx


def update_q_table(state, action, reward, next_state):
    q_1 = q_table[state][action]
    q_2 = reward + gamma * max(q_table[next_state])
    q_table[state][action] += q_learning_rate * (q_2 - q_1)


q_table = np.load(file="results/maxent_q_table.npy") # (400, 3)
one_feature = 20 # number of state per one feature


def idx_to_state(env, state):
    """Convert the (position, velocity) observation of the mountain car environment into an integer state index."""
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_distance = (env_high - env_low) / one_feature
    position_idx = int((state[0] - env_low[0]) / env_distance[0])
    velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
    state_idx = position_idx + velocity_idx * one_feature
    return state_idx


def parse_args(args):
"""Parse command line parameters

@@ -119,6 +57,10 @@ def parse_args(args):
action="version",
# version=f"IRLwPytorch {__version__}",
)
parser.add_argument('--training', action='store_true', help="Enables training of model.")
parser.add_argument('--testing', action='store_true',
help="Enables testing of previously created model.")
parser.add_argument('--render', action='store_true', help="Enables visualization of mountaincar.")
return parser.parse_args(args)


@@ -147,36 +89,51 @@ def main(args):
    args = parse_args(args)
    _logger.debug("Starting crazy calculations...")

    car = MountainCar()
    n_states = 400  # position - 20, velocity - 20
    n_actions = 3
    one_feature = 20  # number of state per one feature
    feature_matrix = np.eye((n_states))  # (400, 400)

    gamma = 0.99
    q_learning_rate = 0.03
    theta_learning_rate = 0.05

    car = None
    if args.render:
        car = MountainCar(True, feature_matrix, one_feature, q_learning_rate, gamma)
    else:
        car = MountainCar(False, feature_matrix, one_feature, q_learning_rate, gamma)

    theta = -(np.random.uniform(size=(n_states,)))
    trainer = MaxEntropyIRL(feature_matrix, theta)

    if False:
        env = gym.make('MountainCar-v0', render_mode="human")
        demonstrations = idx_demo(env, one_feature)
    if args.training:
        q_table = np.zeros((n_states, n_actions))  # (400, 3)
        car.set_q_table(q_table)

        demonstrations = car.idx_demo(one_feature)

        expert = trainer.expert_feature_expectations(demonstrations)
        learner_feature_expectations = np.zeros(n_states)
        episodes, scores = [], []

        for episode in range(300):
            state = env.reset()
        for episode in range(30000):
            state = car.env_reset()
            score = 0

            if (episode != 0 and episode == 100) or (episode > 100 and episode % 50 == 0):
            if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
                learner = learner_feature_expectations / episode
                trainer.maxent_irl(expert, learner, theta_learning_rate)

            state = state[0]
            while True:
                state_idx = idx_state(env, state)
                state_idx = car.idx_state(state)
                action = np.argmax(q_table[state_idx])
                next_state, reward, done, _, _ = env.step(action)
                next_state, reward, done, _, _ = car.env_step(action)

                irl_reward = trainer.get_reward(n_states, state_idx)
                next_state_idx = idx_state(env, next_state)
                update_q_table(state_idx, action, irl_reward, next_state_idx)
                next_state_idx = car.idx_state(next_state)
                car.update_q_table(state_idx, action, irl_reward, next_state_idx)

                learner_feature_expectations += trainer.get_feature_matrix()[int(state_idx)]

@@ -187,28 +144,29 @@
                    episodes.append(episode)
                    break

            if episode % 10 == 0:
            if episode % 100 == 0:
                score_avg = np.mean(scores)
                print('{} episode score is {:.2f}'.format(episode, score_avg))
                plt.plot(episodes, scores, 'b')
                plt.savefig("./learning_curves/maxent_300.png")
                np.save("./results/maxent_300_table", arr=q_table)
                plt.savefig("./learning_curves/maxent_30000.png")
                np.save("./results/maxent_30000_table", arr=q_table)

    else:
        env = gym.make('MountainCar-v0', render_mode="human")
    if args.testing:
        q_table = np.load(file="results/maxent_q_table.npy")  # (400, 3)
        car.set_q_table(q_table)

        episodes, scores = [], []

        for episode in range(10):
            state = env.reset()
            state = car.env_reset()
            score = 0

            state = state[0]
            while True:
                env.render()
                state_idx = idx_to_state(env, state)
                car.env_render()
                state_idx = car.idx_to_state(state)
                action = np.argmax(q_table[state_idx])
                next_state, reward, done, _, _ = env.step(action)
                next_state, reward, done, _, _ = car.env_step(action)

                score += reward
                state = next_state
@@ -217,7 +175,7 @@ def main(args):
                    scores.append(score)
                    episodes.append(episode)
                    plt.plot(episodes, scores, 'b')
                    plt.savefig("./learning_curves/maxent_test_300.png")
                    plt.savefig("./learning_curves/maxent_test_30000.png")
                    break

            if episode % 1 == 0:
@@ -235,14 +193,4 @@ def run():


if __name__ == "__main__":
# ^ This is a guard statement that will prevent the following code from
# being executed in the case someone imports this file instead of
# executing it as a script.
# https://docs.python.org/3/library/__main__.html

# After installing your project with pip, users can also run your Python
# modules as scripts via the ``-m`` flag, as defined in PEP 338::
#
# python -m irlwpytorch.skeleton 42
#
run()
Binary file added src/irlwpytorch/results/maxent_30000_table.npy
Binary file modified src/irlwpytorch/results/maxent_300_table.npy