# In [7]:
from ast import Sub
from pylatex.utils import italic, bold, NoEscape
from pylatex import Document, Section, Subsection, Subsubsection, Description, Itemize, Command, Tabular, Math, TikZ, Axis, Plot, Figure, Matrix, Alignat, NewPage, NewLine
from pylatex.section import Paragraph, Subparagraph, Chapter
from pylatex.utils import italic
import numpy as np
from IPython.display import clear_output
# =================================[PAGE SETUP]=================================
# Build the document object that every later section appends to.
# `geometry_options` is handed to PyLaTeX as the options of the geometry
# package; `lmodern=True` asks PyLaTeX to load the Latin Modern fonts.
geometry_options = {"margin": "1in", "includeheadfoot": False}
doc = Document(geometry_options=geometry_options, lmodern=True)
# ====================================[TITLE]===================================
# Title, author and date go into the preamble; \maketitle then renders them.
# All of these strings are wrapped in NoEscape, so they are emitted as raw LaTeX.
doc.preamble.append(Command('title', NoEscape(r'Gameplaying AI: Reimplementing Proximal Policy Optimization\\ \large Introduction to Artificial Intelligence Project\\ \large ECS 170 Fall 2024')))
# Affiliation markers use \textsuperscript: a bare `^` is only legal in math
# mode, so the previous `^\(1\)` form made LaTeX report "Missing $ inserted".
# NOTE(review): the `?` markers look like placeholders for affiliations that
# have not been filled in yet -- confirm with the authors.
doc.preamble.append(Command('author', NoEscape(r'Darroll Saddi\textsuperscript{1}, Andrew Yeow\textsuperscript{1}, Christine Morayata\textsuperscript{?}, Julia Heiler\textsuperscript{?}, Ryan Li\textsuperscript{1}, Steven Yi\textsuperscript{?}\\ \small\textsuperscript{1}University of California, Davis - Computer Science\\\small\textsuperscript{2}University of California, Davis - Cognitive Science')))
doc.preamble.append(Command('date', NoEscape(r'\today')))
doc.append(NoEscape(r'\maketitle'))
# ==================================[ABSTRACT]==================================
# The abstract is written as one raw LaTeX environment.
doc.append(NoEscape(r'\begin{abstract}This report documents a retrospective reimplementation of proximal policy optimization (PPO), performed at UC Davis for study purposes. Our goal was to not only use the algorithm to successfully train a model to play Sonic the Hedgehog\textsuperscript{TM}, but to additionally discover how to use, articulate, and implement reinforcement learning algorithms while being able to communicate PPO and reinforcement learning through practical experience. We also present this report as an insightful and educational resource for those interested in reinforcement learning and/or recreating our training environment.\end{abstract}'))
# ================================[INTRODUCTION]================================
# The introduction currently holds placeholder text only; it is appended as raw
# LaTeX (NoEscape) inside the 'Introduction' section.
intro_section = Section('Introduction')
with doc.create(intro_section):
    intro_text = NoEscape(r'Insert problem statement, motivation, and general introduction to the project.')
    doc.append(intro_text)
    # doc.append(NoEscape(r'Proximal Policy Optimization (PPO) is a reinforcement learning algorithm that has been used to train agents in continuous action space environments, including video games. It is a policy gradient method that is designed to be simple to implement and computationally efficient. Being a on-policy method, the algorithm learns a policy to make decisions in the environment. This is different from an off-policy method, which learns the value of the optimal policy independently of the agent\'s action. In this project, we reimplemented PPO and used it to train an agent to play and complete levels in Sonic the Hedgehog^\(TM\). On a high level, PPO works by choosing an action for the agent to take, then observing the resultant state and reward. Then, an estimate for the advantage gain is computed (see GAE), which measures how much better the action was compared to the average action at that state. The policy is then updated using a special objective function to prevent the policy from updating too mcuh on a single episode by penalizing if the new policy deviates too much from the original policy, ensuring stability. These steps repeat until training is complete. Here is an example of a citation\footnote{This is a reference}. Here is another reference\footnote{This is another reference}. This is how I would cite the first reference again^\(1\).'))
    # More context on PPO needed above
    # TODO: Be sure to draw on multiple sources for this introduction. Explain motivation for project.
# ================================[BACKGROUND]==================================
# Opens the 'Background' section and writes its first subsection. The text is
# raw LaTeX (NoEscape), so LaTeX special characters must be valid as written.
with doc.create(Section('Background')):
    with doc.create(Subsection('Use-Cases')):
        # The trademark mark uses \textsuperscript because a bare `^` outside
        # math mode is a LaTeX error.
        doc.append(NoEscape(r'PPO is a versatile algorithm that can be used in a variety of environments. It is particularly well-suited for continuous state spaces, which makes it a good candidate for training agents in games like Sonic the Hedgehog\textsuperscript{TM}.'))
        # TODO: Talk about the build-up to PPO, and how it excels over its predecessors.
    # Subsection describing the pieces of PPO, one Paragraph per concept,
    # followed by a scratch demo of PyLaTeX math/table output.
    with doc.create(Subsection('PPO Implementation')):
        # TODO: PLS REWRITE + ADD REFERENCES
        with doc.create(Paragraph('Actor-Critic')):
            doc.append(NoEscape(r'PPO is an actor-critic algorithm, which means that it uses two neural networks to approximate the policy and value functions. The actor network takes the current state as input and outputs a probability distribution over possible actions. The critic network takes the current state as input and outputs an estimate of the value of the state. Both use the metric of "advantage" to update weights, which measures how much better taking a particular action is compared to the average action.'))
        with doc.create(Paragraph('Generalized Advantage Estimation (GAE)')):
            # Inside a raw string `\'` keeps its backslash, and LaTeX reads
            # `\'h` as an acute accent over the next character. The quotation
            # is therefore written with LaTeX quotes (`...') instead.
            doc.append(NoEscape(r"This function estimates `how good is the current state'. It is used to calculate the advantage of taking an action in a given state, which is then used to update the policy. The advantage is a measure of how much better an action is than the average action, and it is used to determine how to update the policy to improve performance."))
        with doc.create(Paragraph('Surrogate Objective Function')):
            doc.append(NoEscape(r'The key to PPO is the surrogate objective function, which helps maximize the probability of taking an action that may eventually lead to high reward. Its main purpose is to keep updates within a trust region, using clipping (see below).'))
        with doc.create(Paragraph('Clipping')):
            doc.append(NoEscape(r'This is a technique used to prevent the policy from changing too much between updates. It is used to ensure that the policy does not change too much between updates, which would normally lead to training instability. It does this by clipping the probability ratio between the new and old policies.'))
            # TODO: invest in some visualization
            # TODO: include an image that connects every part of the algorithm
            # Here is just an example of what math things can be displayed
            # using pylatex (placeholder content, not part of the report).
            a = np.array([[100, 10, 20]]).T  # column vector (3 rows, 1 column)
            M = np.matrix([[2, 3, 4],
                           [0, 0, 1],
                           [0, 0, 2]])
            # Inline equation; the right-hand side now matches the product.
            doc.append(Math(data=['2*3', '=', 6]))
            # Four-column table with a vertical rule between columns 2 and 3.
            with doc.create(Tabular('rc|cl')) as table:
                table.add_hline()
                table.add_row((1, 2, 3, 4))
                table.add_hline(1, 2)
                table.add_empty_row()
                table.add_row((4, 5, 6, 7))
            # M is an np.matrix, so `M * a` is a matrix product, not elementwise.
            doc.append(Math(data=[Matrix(M), Matrix(a), '=', Matrix(M * a)]))
            # Unnumbered aligned equation; escape=False keeps the raw LaTeX.
            with doc.create(Alignat(numbering=False, escape=False)) as agn:
                agn.append(r'\frac{a}{b} &= 0 \\')
# Opens the 'Development Process' section and writes its 'Environment'
# subsection: the framework choice and the list of environment wrappers.
with doc.create(Section('Development Process')):
    with doc.create(Subsection('Environment')):
        with doc.create(Subsubsection('Framework')):
            # Raw LaTeX (NoEscape): the trademark mark uses \textsuperscript
            # because a bare `^` outside math mode is a LaTeX error.
            doc.append(NoEscape(r"After early deliberation on the frameworks and games to work with, we decided upon Sonic the Hedgehog\textsuperscript{TM} as the game to train an agent on, and Stable-Retro, a ROM-loading and reinforcement learning toolkit. The game was chosen for its relative simplicity (gotta go fast!...to win), the fact that there are preexisting benchmarks for games of its kind, and the fact that it is a continuous state game. Stable-Retro allows us to load and interact with the Sonic the Hedgehog\textsuperscript{TM} ROM, as it provides a variety of preexisting tools for reading information from the game that is necessary for developing a RL training environment as well as providing reward to the agent. This approach allows us to focus on the setup for training and the training itself, removing the need for us to painstakingly target specific RAM values from the game process. Such vital information used to design the reward function and training restrictions include the current level, Sonic's position, and the number of lives."))
        with doc.create(Subsubsection('Wrappers')):
            doc.append(NoEscape(r'To simplify the training environment, Darroll used wrappers to preprocess a variety of features about the game state before it is fed into the algorithm. These wrappers are used to reduce the computational cost of training and to provide the agent with a more informative view of the environment. The following wrappers were implemented:'))
            # One description-list entry per wrapper: (label, explanation).
            # These are plain strings, not NoEscape.
            with doc.create(Description()) as desc:
                desc.add_item("Rewarder", "Allows the specification of a custom reward function.")
                desc.add_item("Observer", "Feeds what is on the screen into a convolutional neural network to easily map the game state to agent, as opposed to manually reading the game state from the ROM.")
                desc.add_item("Frame Stacker", "Stacks the last 8 frames together to teach a mapping between current and previous states (extremely crucial, allows for learning of jumping timing/early jumping over obstacles).")
                desc.add_item("Action Mapper", "Discretize the action space to a limited number of one-hot-encoded actions, as opposed to completely random controller input combinations, decreasing input complexity which allows for faster learning.")
                desc.add_item("Frame Skipper", "Increase the tick speed of the game to allow for faster training.")
                desc.add_item("Multiprocess Vectorizer", "Allows for training multiprocessing, which speeds up training by allowing for multiple agents to investigate solutions to stage obstacles concurrently.")
    # 'Reward Function' subsection: an overview paragraph followed by one
    # Paragraph per reward design that was tried. All text is raw LaTeX
    # (NoEscape), so special characters such as `&` must be escaped by hand.
    with doc.create(Subsection('Reward Function')):
        # `\&` is required here: an unescaped `&` in running text makes LaTeX
        # fail with "Misplaced alignment tab character &".
        doc.append(NoEscape(r"Using OpenAI's Stable-Baselines3 PPO implementation, Darroll abstracted the algorithm to focus on the training environment and the reward function. For context, Stable-Baselines3 is a library that provides a variety of reinforcement learning algorithms, including PPO. I should also mention that the metric used to generally determine the success of the agent are the win/loss ratio and the average level completion time. Furthermore, any mention of the quality of the agent's problem-solving \& exploration abilities were essentially measured by observing two things from trained models: 1) if agents learned to jump over obstacles, which essentially measures problem solving ability, and 2) if agents learned to complete a circular loop in the level, which measures exploration ability. The latter was an interesting problem to solve, which will be discussed later but varied drastically from model-to-model depending on hyperparameter values and the reward function."))
        with doc.create(Paragraph('Velocity')):
            doc.append(NoEscape(r"Initial tests with velocity-based rewards were unsuccessful. Calculated as delta x, velocity-based rewards led to reward farming as well as policies that preferred to move around in ways that acquired bursts of speed. Even after tuning the reward to account only for rightward velocity, the agent still failed to consistently beat the level. This may have been due to the fact that this reward had no relation to completing the level, which would explain the agent's inability to learn how to do so."))
        with doc.create(Paragraph('Progress')):
            doc.append(NoEscape(r"Based on the failures of the previous reward function as well as on preexisting research, I moved on to a progress-based reward function. This function rewarded the agent for moving rightward progress through the level, which was a good heuristic. This reward function was successful in training the agent to complete the level, but it was not able to train the agent to complete the loop. It was at this point where I additionally implemented punishments (negative rewards) for the agent losing a life, as well as massive static rewards for completing the level. This reward function yielded good results, and the agent was able to at least make progress up to the loop. However, I observed the agents getting stuck in a local minimum, where they would repeatedly get stuck at the loop and not learn how to complete it. It was clear I needed to tune the hyperparameters to increase the agent's exploration ability."))
    # 'Hyperparameter Tuning' subsection: an intro paragraph plus a description
    # list with one (name, chosen value and rationale) entry per hyperparameter,
    # followed by the still-empty 'Completing the Loop' subsection.
    with doc.create(Subsection('Hyperparameter Tuning')):
        doc.append(NoEscape(r"A roadblock in development was figuring out good hyperparameters and reward magnitudes to train the model with. Depending on the values chosen, the outcomes varied greatly. What follows is a brief description of what values I converged upon, and my intuition for why they worked."))
        with doc.create(Description()) as desc:
            desc.add_item("Discount factor", "0.95, arbitrarily chosen from training standards")
            desc.add_item("Bias trade-off vs variance factor", "0.985, arbitrarily chosen from training standards")
            desc.add_item("Clipped surrogate objective", "0.75, arbitrarily chosen from training standards")
            desc.add_item("Entropy", "0.30, mild magnitude to encourage random actions to explore solutions to completing the loop, without being excessively high which prevented the agent from learning from successes.")
            # NOTE(review): 0.0001 is listed among the "too high" values but is
            # lower than the chosen 0.00075 -- confirm the intended numbers.
            desc.add_item("Learning Rate", "0.00075, values that were too high (0.1, 0.001, 0.0001) caused the agent to enter the exploitation phase of training too early, evident by the agent performing at a local maximum (preferring to get stuck at the loop to farm progress rewards during the restart). Values that were too low (0.00001) decreased learning speed without significant benefit, so a slightly higher value was chosen to increase speed while maintaining exploration.")
            desc.add_item("Timesteps before updating policy", "512, this value measures the number of timesteps (ticks) per agent before the policy is updated. 4096 was originally used, as I believed this would allow the agent longer stretches of forward progress with large amounts of positive reward. Although agents were able to beat certain areas of the map consistently after around 750,000 timesteps, I discovered that lowering the value to 512 decreased this to a mere 50,000-100,000 timesteps. I interpreted this as being due to each section of the map essentially now being solved in smaller pieces with 512, whereas 4096 considers much larger portions of the level as one huge problem to solve.")
            # NOTE(review): the next two entries still carry placeholder text.
            desc.add_item("Level complete reward", NoEscape("The third etc \\ldots"))
            desc.add_item("Life lost punishment", NoEscape("The third etc \\ldots"))
    with doc.create(Subsection('Completing the Loop')):
        # TODO: this subsection has no content yet; an empty string is appended.
        doc.append(NoEscape(r""))
# 'GitHub' section: a pointer to the project repository, followed by a
# subsection holding a single training-result figure.
with doc.create(Section('GitHub')):
    doc.append(r"A repository containing the code used in this project can be found at:")
    # Two explicit line breaks separate the sentence from the URL.
    for _ in range(2):
        doc.append(NewLine())
    doc.append(r"https://github.com/Iemontine/ProximalPolicyOptimization.")
    # TODO: move this
    with doc.create(Subsection('Cool graph')):
        # 'h!' is passed through as the figure's placement specifier.
        with doc.create(Figure(position='h!')) as img:
            image_filename = "Pass1-3_500k_-0.01punishment_2.jpg"
            img.add_image(image_filename, width='450px')
            img.add_caption('Look at this photograph')
# Start the contributions on a fresh page.
doc.append(NewPage())  # Insert a page break

# (team member, contribution summary) pairs, in the order they appear in the
# report. The summaries are plain strings, not NoEscape.
contributions = (
    ('Darroll Saddi', r'Set up training environment, including integrating libraries & frameworks. Performed inference- and intuition-based hyperparameter tuning to produce training results. Designed reward function & investigated multiple implementations including the multi-pass implementation. Implemented metric-tracking during training and wrote plotting code. Contributed to formal and GitHub documentation. Contributed research efforts and project direction.'),
    ('Andrew Yeow', r'Implemented proof of concept with DQL.'),
    ('Steven Yi', r"Refactored OpenAI's proximal policy optimization implementation to suit our purposes."),
    ('Christine Morayata', r'Contributed to documentation.'),
    ('Julia Heiler', r'Contributed to documentation.'),
    ('Ryan Li', r'Contributed to documentation.'),
)

# One subsection per team member inside the 'Contributions' section.
with doc.create(Section('Contributions')):
    for member, summary in contributions:
        with doc.create(Subsection(member)):
            doc.append(summary)

# Compile the document to 'output.pdf', retrying a limited number of times.
#
# The previous `while True` + bare `except:` loop never terminated when the
# failure was deterministic (e.g. a LaTeX error in the document) and, because a
# bare `except` also traps KeyboardInterrupt, the cell could not be interrupted.
# Only `Exception` is caught here, the number of attempts is capped, and the
# last failure is re-raised so the cause is visible.
MAX_PDF_ATTEMPTS = 5
for attempt in range(1, MAX_PDF_ATTEMPTS + 1):
    try:
        doc.generate_pdf('output')
    except Exception as err:
        if attempt == MAX_PDF_ATTEMPTS:
            # Out of retries: surface the original error instead of hiding it.
            raise
        # Clear the cell output from the failed attempt before retrying.
        clear_output()
        print("PDF generation failed (attempt %d/%d): %r; retrying..."
              % (attempt, MAX_PDF_ATTEMPTS, err))
    else:
        print("Completed!")
        break

# Output: Completed!
