In [239]:
from ast import Sub
from math import e
from pylatex.utils import italic, bold, NoEscape
from pylatex import Document, Section, Subsection, Subsubsection, Description, Itemize, Command, Tabular, Math, TikZ, Axis, Plot, Figure, Matrix, Alignat, NewPage, NewLine, Enumerate, Center, Package
from pylatex.section import Paragraph, Subparagraph, Chapter
from pylatex.utils import italic
import numpy as np
from IPython.display import clear_output
# =================================[PAGE SETUP]=================================
# Geometry options are forwarded to the pylatex Document constructor below.
geometry_options = {
    "margin": "0.8in",
    "includeheadfoot": False,
}
doc = Document(geometry_options=geometry_options, lmodern=True)
# amssymb is needed for \mathbb, which the report's equations use.
doc.packages.append(Package('amssymb'))
# ====================================[TITLE]===================================
# Title, author and date are registered in the preamble (in that order) so that
# the \maketitle appended afterwards can pick them up.
for preamble_name, preamble_tex in (
    ('title', r'Gameplaying AI: Implementing Proximal Policy Optimization\\ \large Introduction to Artificial Intelligence Project\\ \large ECS 170 Spring 2024'),
    ('author', r'Darroll Saddi\textsuperscript{1}, Andrew Yeow\textsuperscript{1}, Christine Morataya\textsuperscript{2}, Julia Heiler\textsuperscript{1}, Ryan Li\textsuperscript{1}, Steven Yi\textsuperscript{1}\\ \small\textsuperscript{1}University of California, Davis - Computer Science\\\small\textsuperscript{2}University of California, Davis - Cognitive Science'),
    ('date', r'\today'),
):
    doc.preamble.append(Command(preamble_name, NoEscape(preamble_tex)))
doc.append(NoEscape(r'\maketitle'))
# ==================================[ABSTRACT]==================================
# The abstract is emitted as raw LaTeX, including its own begin/end environment.
doc.append(
    NoEscape(
        r'\begin{abstract}This report documents a retrospective reimplementation of proximal policy optimization (PPO), for study purposes. Our goal was to not only use the algorithm to successfully train an artificial intelligence to play a semi-complex 2D video game, but through this practical experience, discover how to use, articulate, and implement reinforcement learning (RL) algorithms like PPO. We also present this report as an insightful and educational resource for those interested in reinforcement learning, studying our design process, and/or recreating our training environment.\end{abstract}'
    )
)
# ================================[INTRODUCTION]================================
with doc.create(Section('Introduction')):
    # First paragraph: what PPO is and what the project set out to train.
    doc.append(
        NoEscape(
            r'Reinforcement learning (RL) has emerged as a powerful paradigm for artificial intelligence and artificial learning. Proximal Policy Optimization (PPO), developed by OpenAI, is a notable advancement in this field, being a versatile algorithm that has shown significant promise across various domains, especially continuous state spaces. This report documents the extensive learning and implementation process for this algorithm. To facilitate training and algorithm visualization, our goal was to train an agent to quickly complete levels in a video game, in specific, Sonic the Hedgehog\textsuperscript{TM} for the Sega Genesis. We place a particular focus on optimizing reward functions and hyperparameters by analyzing the behaviors of trained agent.'
        )
    )
    # Two consecutive line breaks separate the two introduction paragraphs.
    for line_break in (NewLine(), NewLine()):
        doc.append(line_break)
    # Second paragraph: motivation and intended audience.
    doc.append(
        NoEscape(
            r'Our objectives were to increase the broader community’s understanding of the RL training process/environment in addition to showing how an RL algorithm can be leveraged to intuitively find a solution to a problem. Video games and other artificial environments are an ideal environment to practice or test RL algorithms as they are easy to understand and analyze\footnote{e.g. agent is learning to go to the left when the goal is on the right → something is wrong with the reward function!}. Through this project, we aim to provide an insightful and educational resource for the practical challenges associated with utilizing and implementing PPO. By articulating and documenting our development process, challenges, and solutions, we hope to contribute to the ease of access and understanding of introductory resources for reinforcement learning, which is an area we believe requires further development.'
        )
    )
    # TODO: Be sure to draw on multiple sources for this introduction. Explain motivation for project.
# ================================[BACKGROUND]==================================
with doc.create(Section('Background')):
    with doc.create(Subsection('Use-Cases')):
        # Raw LaTeX so that the \footnote citation is passed through unescaped.
        doc.append(
            NoEscape(
                r'Proximal policy optimization\footnote{OpenAI, Proximal policy optimization, 2018, Documentation, https://spinningup.openai.com/en/latest/algorithms/ppo.html} (PPO) is a versatile reinforcement learning algorithm that can be used in a variety of simulated or emulated environments. It is particularly well-suited for continuous state spaces, which makes it a good candidate for training agents to play a video game.'
            )
        )
        # TODO: Talk about the build-up to PPO, and how it excels over its predecessors.
    with doc.create(Subsection('PPO Algorithm')):
        # TODO: PLS REWRITE + ADD REFERENCES
        with doc.create(Paragraph('Actor-Critic')):
            # Raw LaTeX: the text contains inline math delimiters \( ... \).
            doc.append(
                NoEscape(
                    r'PPO is an actor-critic algorithm, which means that it uses two neural networks to approximate the policy and value functions. In actor-critic models, the actor controls the action the agent takes (by executing the policy) and the critic returns a value that represents how good or bad that action is in the current state. Both networks take the current state as input, however the actor network outputs a probability distribution over possible actions, whereas the critic network outputs an estimate of the value of the state. Both use the metric of \(advantage\) to update weights, which measures how much better taking a particular action is compared to the average action.'
                )
            )
        with doc.create(Paragraph('Surrogate Objective Function')):
            # Introductory text for the clipped objective, followed by the equation itself.
            doc.append(
                NoEscape(
                    r'The key to applying policy updates in PPO is the surrogate objective function, which helps maximize the probability of taking an action that may eventually lead to high reward. Its main purpose is to keep policy updates within a trust region, using clipping within the range [1 - \(\epsilon\), 1 + \(\epsilon\)]. This ensures that the policy does not change too much between updates, which in turn improves training stability. The surrogate objective function is defined as:'
                )
            )
            # Unnumbered equation; escaping is disabled so the LaTeX is emitted verbatim.
            with doc.create(Alignat(numbering=False, escape=False)) as agn:
                agn.append(
                    r"L^{\text{CLIP}}(\theta) = \mathbb{E}_t \left[ \min \left( r_t(\theta) \hat{A}_t, \text{clip}(r_t(\theta), 1 - \epsilon, 1 + \epsilon) \hat{A}_t \right) \right]"
                )
        with doc.create(Paragraph('Probability Ratio')):
            # The prose must agree with the equation appended below:
            # r_t = pi_theta(a_t | s_t) / pi_theta_old(a_t | s_t) is a ratio of action
            # probabilities under the two policies, so a value above 1 means the new
            # policy makes the action MORE likely and a value below 1 means less likely.
            doc.append(NoEscape(r'This ratio is how the algorithm compares the new and previous policy, by simply dividing the probability of taking the action \(a_t\) in the state \(s_t\) under the new policy by the probability of taking it under the old policy. The policy is denoted as \(\pi\). A resultant value greater than 1 means the action \(a_t\) in the state \(s_t\) is more likely in the new policy than in the old one. Conversely, a value between 0 and 1 means the action is less likely. It is defined as:'))
            # Unnumbered equation; escaping is disabled so the LaTeX is emitted verbatim.
            with doc.create(Alignat(numbering=False, escape=False)) as agn:
                agn.append(NoEscape(r"""r_t(\theta) = \frac{\pi_{\theta}(a_t \mid s_t)}{\pi_{\theta_{old}}(a_t \mid s_t)}"""))
        with doc.create(Paragraph('Generalized Advantage Estimation (GAE)')):
            doc.append(NoEscape(r"GAE is used to calculate advantage, a measure of how much better (or worse) an action is compared to the expected return from the current policy, AKA the average action. At a high level, GAE significantly reduces variance while maintaining a tolerable level of bias, in turn stabilizing training of the agent. For posterity, the GAE equation is shown below, but will not be covered in further detail for simplicity:"))
            # Unnumbered equation; escaping is disabled so the LaTeX is emitted verbatim.
            with doc.create(Alignat(numbering=False, escape=False)) as agn:
                # The advantage estimate is written as \hat{A}_t, the same notation the
                # surrogate-objective equation uses. A literal accented character is a
                # text accent and is not valid inside math mode.
                agn.append(NoEscape(r"""\hat{A}_t = \sum_{\ell=0}^{\infty} (\gamma \lambda)^{\ell} \delta_{t+\ell}"""))
        with doc.create(Paragraph('Clipping')):
            # Raw LaTeX: the clipping range is written with inline $...$ math.
            doc.append(
                NoEscape(
                    r"The key idea of PPO, clipping, prevents the policy from changing too much between updates, thereby improving training stability. This is accomplished by clipping the ratio of the new policy to the old policy within a specific range, [1 - $\epsilon$, 1 + $\epsilon$]. This range acts like a safety zone, ensuring that the updates are small and controlled. If the ratio goes outside this range, the update is clipped, or limited, so the change isn't too big. In short, clipping ensures limited, conservative updates to the model."
                )
            )

            # TODO: invest in some visualization
            # TODO: include an image that connects every part of the algorithm
            # here is just an example of what math things can be displayed using pylatex
            # a = np.array([[100, 10, 20]]).T
            # M = np.matrix([[2, 3, 4],
            #                 [0, 0, 1],
            #                 [0, 0, 2]])
            # doc.append(Math(data=['2*3', '=', 9]))
            # with doc.create(Tabular('rc|cl')) as table:
            #     table.add_hline()
            #     table.add_row((1, 2, 3, 4))
            #     table.add_hline(1, 2)
            #     table.add_empty_row()
            #     table.add_row((4, 5, 6, 7))
            # doc.append(Math(data=[Matrix(M), Matrix(a), '=', Matrix(M * a)]))
with doc.create(Section('Development Process')):
    with doc.create(Subsection('Environment')):
        with doc.create(Subsubsection('Framework')):
            # Raw LaTeX: the paragraph carries three \footnote citations and a
            # \textsuperscript trademark mark that must not be escaped.
            doc.append(NoEscape(r"After discovering Gym-Retro, which offers a variety of learning scenarios to choose from, Andrew developed a proof-of-concept of training with a different algorithm, Deep Q-Learning. However, due to the outdatedness of some of the libraries and all its dependencies, we opted to search for a different approach that supported our desired versions of Python, Pytorch, and Gymnasium\footnote{Farama Foundation, Gymnasium, (2024), Documentation, https://gymnasium.farama.org/index.html} (environment creator/loader for RL). After deliberation on the frameworks and games to work with, we decided upon using Sonic the Hedgehog\textsuperscript{TM} as the game to train an agent on, and Stable-Retro, a video game ROM-loading framework\footnote{Farama Foundation, Stable-Retro, (2024), Documentation, https://stable-retro.farama.org/}. This game was chosen for its relative simplicity (gotta go fast...to win), its intuitive and semi-complex continuous state space, and the fact that there are pre-existing benchmarks for games of its kind\footnote{Alex Nichol, Gotta Learn Fast: A New Benchmark for Generalization in RL, (2018), Report, https://arxiv.org/abs/1804.03720}. Stable-Retro allows us to load and interact with video game ROMs as the training environment, and provides a variety of pre-existing tools for reading information from retro video games. These libraries proved essential for developing our training environment, as they allowed us to design our own reward function and select and tune inputs to the convolutional neural networks, for example. Not only do these libraries have better dependency support, this approach also allowed us to focus on the setup for training and the training itself, removing the need for us to painstakingly target specific RAM values from the game process. Such vital information used to design the reward function and training restrictions include the current level, Sonic’s position, and the number of lives."))
        with doc.create(Subsubsection('Wrappers')):
            # Lead-in paragraph (raw LaTeX because of the \footnote).
            doc.append(
                NoEscape(
                    r'To simplify the training environment, I (Darroll) used wrappers to preprocess a variety of features about the game state before it is fed into the algorithm and/or neural networks. These wrappers are used to reduce the computational cost of training and to provide the agent with a more informative view of the environment. Moreover, some wrappers are used to reduce time complexity\footnote{Faster learning refers to a decrease in time needed for the agent to solve problems. Faster training refers to the actual processing power and time needed to train a model}. The following wrappers were implemented:'
                )
            )
            # One description-list entry per wrapper, added in the order listed.
            with doc.create(Description()) as desc:
                for wrapper_name, wrapper_summary in (
                    ("Rewarder", r"Allows the specification of a custom reward function, in addition to timing resets (e.g. if Sonic dies, go back to start of level, reset all tracked variables, administer negative reward)."),
                    ("Observer", r"Feeds what is on the screen into a feature-extraction convolutional neural network to easily and visually map game states, as opposed to manually reading game states from RAM values. This proved to be crucial for training, as it allowed for agents to learn an association between map features and an action to respond to them with."),
                    ("Frame Stacker", r"Stacks the last 8 frames together to provide vectors between current and previous states to the CNN (crucial, allows for improved learning of jump timing, including early jumping over obstacles or gaps)."),
                    ("Action Mapper", r"Discretizes the action space to a limited number of one-hot-encoded actions (e.g. move right, jump, move left), as opposed to completely random controller input combinations which is the default. This thereby decreases input complexity and allows for faster learning."),
                    ("Frame Skipper", r"Periodically skips game frames to allow for faster training, further reducing complexity as less frames are processed overall while still maintaining enough information for the agent to learn effectively."),
                    ("Multiprocess Vectorizer", r"Allows for training multiprocessing, which speeds up training by allowing for multiple agent instances to concurrently investigate solutions to problems."),
                ):
                    desc.add_item(wrapper_name, wrapper_summary)
    with doc.create(Subsection('Reward Function')):
        # Preface on the PPO implementation that was used.
        doc.append(
            NoEscape(
                r"To preface, Stable-Baselines3 is a library that provides a variety of implementations of reinforcement learning algorithms, including PPO. In this project, we used OpenAI's Stable-Baselines3 version of PPO, allowing me to abstract the algorithm implementation within the context of its mathematical formulation. This further allowed me to focus on the training environment and the reward function."
            )
        )
        with doc.create(Subsubsection('Note about Evaluation Metrics')):
            doc.append(
                NoEscape(
                    r"I should also mention that the metric I used to estimate the success of the agent are the win/loss ratio and level completion time. Furthermore, any mention of the quality of the agent’s problem-solving \& exploration abilities were essentially measured by observing two things from trained models:"
                )
            )
            # Numbered list of the two observed behaviours. Only the first item is
            # raw LaTeX (it contains a \footnote); the second is plain text.
            with doc.create(Enumerate()) as enum:
                enum.add_item(
                    NoEscape(
                        r"Whether the agent learned to jump over obstacles, which I used to approximate the divide between exploration and exploitation phases of training\footnote{For context, exploration is the phase of training where the agent is learning to solve the problem, and exploitation is the phase where the agent is optimizing its solution. Research the concept of the Discount Factor for more information.}, and whether the transition was too late or early."
                    )
                )
                enum.add_item(
                    r"Whether the agent learned to complete a circular loop in the middle of the level, which I used to measure exploration ability as it required the agent to maintain rightward momentum to solve."
                )
            doc.append(
                r"The latter was an interesting problem, and will be discussed later regarding drastic variations from model to model depending on hyperparameter values and the reward function."
            )
        with doc.create(Paragraph('Velocity')):
            doc.append(
                NoEscape(
                    r"Initial tests with velocity-based rewards were unsuccessful. Calculated as $\Delta x$, where x is agent's x-coordinate in the level, velocity-based rewards led to reward farming\footnote{OpenAI, Gym-Retro, (2018), Article, https://openai.com/index/gym-retro/} as well as policies that preferred to move around in ways that acquired bursts of speed. Even after tuning the reward to account only for rightward velocity, the agent still failed to consistently beat the level. This may have been due to the fact that this reward had a small relation to the implicit task of completing the level, explaining the agent's inability to learn how to do so. In more abstract terms, failing to propose a proper problem statement for the agent seemed to lead to the problem being solved in unexpected ways."
                )
            )
        with doc.create(Paragraph('Progress')):
            doc.append(
                NoEscape(
                    r"Based on the failures of the previous reward function as well as on pre-existing research\footnote{Viet Nguyen, Sonic-PPO-pytorch, (2021), GitHub repository, https://github.com/uvipen/Sonic-PPO-pytorch}, I moved on to a progress-based reward function. This function rewards the agent for making rightward progress through the level, and is a better heuristic as it actually relates to the task of winning because the end goal is always somewhere on the right side of the level. This reward function successfully trained the agent to occasionally avoid some obstacles or enemies, but it was not able to train the agent to consistently complete the loop. It was at this point where I additionally implemented punishments (negative rewards) for the agent losing a life, as well as massive static rewards for completing the level. This reward function yielded good results, and the agent was able to at least make progress up to the loop. However, I observed the agents getting stuck in a local minimum by repeatedly jumping rightwards into the loop, and would never learn how to traverse it, as doing so required a conservation of momentum by holding the \textquoteleft move right\textquoteright \space button without jumping. I believe this was due to agents entering the exploitation phase of training too early, culminating in the agent repeating this behavior of jumping to the right that was successfully solving most obstacles before the loop. Furthermore, completing the loop with progress as the reward is problematic as completing it incurs leftward movement, where no reward would be given. It was clear I needed to tune the hyperparameters to increase the agent’s exploration ability, to not only have it learn to maintain its momentum but to additionally explore this segment of no-reward within the loop."
                )
            )
            # Figure illustrating the loop problem; the image path is relative.
            with doc.create(Figure(position='th!')) as fig:
                image_filename = "progress_visualization.png"
                fig.add_image(image_filename, width='180px')
                fig.add_caption('Visualization of the progress dynamic problem within the loop')
    with doc.create(Subsection('Hyperparameter Tuning')):
        # Lead-in paragraph on how the hyperparameter values were arrived at.
        doc.append(
            NoEscape(
                r"A roadblock in development was figuring out good hyperparameters and reward magnitudes to train the model with. Depending on the values chosen, even if varied by the small factor of 10, the outcomes varied greatly. At some point, I also spent time investigating using the Optuna library to perform automated hyperparameter tuning, but due to the sheer complexity and required length of training (several hours) per model, this approach was not feasible under time constraints as it would require multiple full training cycles to determine any optimized values. What follows is a brief description of what values I converged upon, and my intuition for why they worked."
            )
        )
        # One description-list entry per hyperparameter: label, then value and rationale.
        with doc.create(Description()) as desc:
            for hyperparameter_label, hyperparameter_notes in (
                ("Discount factor", r"0.95, arbitrarily chosen from training standards"),
                ("Bias trade-off vs variance factor", r"0.985, arbitrarily chosen from training standards"),
                ("Clipped surrogate objective", r"0.75, arbitrarily chosen from training standards"),
                ("Entropy", r"0.30, mild magnitude to encourage random actions to explore solutions to completing the loop, without being excessively high which prevented the agent from learning from successes (observed when the agent jumps around randomly even after several hundred thousand timesteps)."),
                ("Learning Rate", r"0.00075, values that were too high (0.1-0.0001) caused the agent to enter the exploitation phase of training too early, evident by the agent performing at a local minimum where it would prefer to get stuck at the loop to farm progress rewards during the reset from taking too long. Values that were too low (0.00001) decreased learning speed without significant benefit, so a slightly higher value was chosen to increase speed while maintaining exploration ability as needed."),
                ("Timesteps before updating policy", r"512, this value measures the number of timesteps that should pass before the policy is updated for all agents. 4096 was originally used, as I believed this would allow the agent to associate longer stretches of forward progress with large amounts of positive reward. Although agents learned to beat early areas of the map consistently after around 500,000 timesteps, I discovered that lowering the value to 512 decreased this to a mere 50,000-100,000 timesteps. I interpreted this as being due to each section of the map essentially now being solved in smaller pieces with 512, whereas with 4096, much larger portions of the level are presented to the neural network as larger problems."),
            ):
                desc.add_item(hyperparameter_label, hyperparameter_notes)
    with doc.create(Subsection('Multi-Pass Training')):
        with doc.create(Subsubsection('Adjusting Reward/Punishment Weights')):
            # Raw LaTeX: the progress reward is written as inline math.
            doc.append(
                NoEscape(
                    r"At this point in development, I arbitrarily chose weights for agent progress rewards, death punishments, and level complete rewards. As it turns out however, these each had massive impacts on the quality of training and produced widely varying models. Depending on these magnitudes, policies could develop that never learned to complete the loop, opting again to farm the initial progress rewards from starting the level, or even in the worst-case of my observation, be completely unable to learn to jump over obstacles. Based on this, I lowered the progress rewards to be calculated as $\Delta x / 100$. This produces rewards as decimals in the range of 0.0 and 0.5. The resultant training was faring far better, and it seemed that keeping rewards low, but not insignificant, was important for agents to develop a mapping between making forward progress and getting closer to the true goal which is the end of the level. It follows that level completion reward and death punishment magnitudes converged on being set to 50 and -1 respectively, both being subject to change as I continued my optimization efforts."
                )
            )
        with doc.create(Subsubsection('Multi-Pass')):
            doc.append(
                r"At this point, I was getting results, but they weren't satisfactory."
            )
            # Chart for the one-shot training run (the text below calls it Figure 2).
            with doc.create(Figure(position='th!')) as fig:
                image_filename = "Pass1_15M_1.png"
                fig.add_image(image_filename, width='497px')
                fig.add_caption('Winrate, completion time progression after 1.5 million timesteps, one-shot training')
            doc.append(
                r"Figure 2 shows that by the end of training the model with the previous magnitudes, the winrate is less than half the time, even after a period of volatile failure that occurs from early training in which the likelihood of losing a life is very high. Furthermore when it is able to complete the level, the 5 lowest completion times are relatively low, which was good. However, the average time is very poor, at around 1 minute and 50 seconds, which means completion times are inconsistent given the lower and middle bounds. Keep in mind the world record completion times of this level by a human ranges between 18-25 seconds. This led me to develop a new multi-pass method, in which each pass of three passes utilizes a different reward function, working as follows:"
            )
            # Centered five-column table of per-pass reward settings. Every row,
            # header included, is followed by a horizontal rule.
            with doc.create(Center()) as centered_table:
                with doc.create(Tabular('|c|c|c|c|c|')) as table:
                    table.add_hline()
                    for row_cells in (
                        (NoEscape(r'\textbf{Pass \# (500k timesteps)}'), NoEscape(r'\textbf{Progress}'), NoEscape(r'\textbf{Winning}'), NoEscape(r'\textbf{Death}'), NoEscape(r'\textbf{Lack of progress}')),
                        ('Pass 1 (original setup)', NoEscape(r'$\Delta x / 100$'), 50, '0', '0.00'),
                        ('Pass 2', NoEscape(r'$\Delta x / 100$'), 50, '-1', '-0.01'),
                        ('Pass 3', NoEscape(r'$\Delta x / 100$'), 'max(0, timesteps to win/1000)', '-2', '-0.02'),
                    ):
                        table.add_row(*row_cells)
                        table.add_hline()
            doc.append(
                NoEscape(
                    r"""At this point, I was skeptical of an implementation that 'hard-coded' a learning progression, as opposed training with an ideal reward function and ideal hyperparameters. I only took this approach because I thought at the time that punishing on no-progress was not possible due to initial failures with this method that turned out to be due to poor reward magnitudes and hyperparameter choices. My intuition for what was wrong, was that the setup lacked a heuristic emulating the learning curve a human might traverse when learning to beat a video game level quickly; the agents were able to learn to beat the level, but would rarely optimize for time, which I believed could be achieved by punishing on a lack of progress once the agent knew how to beat the level. While the multi-pass method proved to be unnecessary, here were the results:"""
                )
            )
            # Multi-pass results chart (the text below calls it Figure 3).
            with doc.create(Figure(position='th!')) as fig:
                image_filename = "multipass.png"
                fig.add_image(image_filename, width='497px')
                fig.add_caption('multi-pass implementation; winrate progression, winrate frequency, overall winrate')
            doc.append(
                r"The first chart in Figure 3 provides context on how each of the three pass's win percentages and frequencies compare. Additionally, by comparing the Winrate Progression charts from Figure 2 and Figure 3, it is clear that the performance of this 3-Pass implementation is a significant improvement. The winrate, stabilized after ignoring the volatile exploration phase (estimated to be within the first 600 wins/losses) is at least 82.94%. Compared to the 49.33% from the preceding setup, this is a 68.13% increase in winrate, which is quite significant. To prove this methodology is in any way ideal, I decided to attempt training 1.5M timesteps on each component Pass separately, in order to examine their performance and draw comparisons, here are the results:"
            )
            # Two comparison figures; only the second one carries a caption.
            with doc.create(Figure(position='th!')) as fig:
                image_filename = "punish_v_nopunish_winrates_levelcompletion.png"
                fig.add_image(image_filename, width='490px')
            with doc.create(Figure(position='th!')) as fig:
                image_filename = "punish_v_nopunish_wintimes.png"
                fig.add_image(image_filename, width='260px')
                fig.add_caption('Comparing results from training with and without punishing on lack of progress')
            doc.append(
                r"Resultant winrate from training while punishing lack of progress, compared to training in multiple passes with varying reward functions, proved to be slightly better for 1.5 million timesteps. Where multi-pass had winrate in the range of 65.86%-82.34%, a single pass with Pass 3's reward function, resulted in a winrate in the range of 77.20%-83.05% in the same training time; a slight improvement in the lower bound of winrate progression. Similarly, the wincount also went up, from 267+892+1072=2231 for multi-pass to a flat 3037 wins from the extended Pass 3. The fastest completion times seemed to converge to roughly the same efficiency between both methodologies. Finally, although resultant wintimes for both methods are roughly the same (31.5-31.8 seconds), the average was driven down more than 4 seconds. For easier comparison, the charts also include the results from Figure 2. Clearly, Pass 3's reward function is a better fit for the problem, and Pass 1's reward function proved to be detrimental to training, driving all the metrics down due to its weakness, especially when examining its inclusion during multi-pass. This is significant data, as it means that hard-coding a learning progression was completely unnecessary."
            )
# Closing section: key takeaways, then the team's view on RL as learning.
with doc.create(Section('Conclusion')):
    with doc.create(Subsection('Results, Key Takeaways')):
        # Raw LaTeX: contains a \footnote citation and a --- dash.
        doc.append(NoEscape(r'Throughout the development of this project, I often intuited how to produce more efficient training by referencing how the agent was performing in the environment essentially by observing the agent’s behavior and adjusting hyperparameters accordingly. For example, I noticed that the agent was getting stuck at the loop, so I increased the entropy and decreased learning rate to encourage more exploration of the problem in addition to decreasing the number of timesteps before updating the policy to allow the agent to learn to beat the level in smaller pieces. Another example was how I designed the reward function. I often thought of how a human might learn to beat a level of a video game quickly. I imagined they would traverse a learning curve, first needing to learn to beat the level, then optimizing after that and self-punishing when dying or not moving faster. This led me to implement the multi-pass implementation that produced positive results. Although the multi-pass method proved to be generally worse than a single pass with an ideal reward function, this flow of thought was what led me back to this implementation in the first place. These realizations remind us of The Bitter Lesson, from an article analyzed in this course that stresses the importance of avoiding the human-knowledge approach to AI, and to instead allow AI agents to learn for themselves\footnote{Rich Sutton, The Bitter Lesson, Article, (2019)} --- "We want AI-agents that can discover like we can, not which contain what we have discovered." It is very clear that attempting to implement human ingenuity should not be the focus of practicing AI, but rather the focus should be on the learning process itself.'))
    with doc.create(Subsection('Reinforcement Learning as True Learning')):
        # Raw LaTeX: the percent sign is written as \%.
        doc.append(NoEscape(r"We have come to believe that reinforcement learning algorithms like PPO mimic true learning, likely going through the same learning curve that Darroll attempted to hard code, in its own way. These high-level, ‘black-box’ observations were crucial to the success of the project, as they allowed him to find intuitive learning heuristics, leading to the 83\% training accuracy of the model. It is also clear that the focus should undoubtedly be on training and learning, rather than attempting to hard-code any human-centric or human-knowledge-based implementations. We believe that similar simplifications and abstractions of the inner workings of algorithms must be taken in other reinforcement learning projects, as they allow one to place this focus on learning. This can be done without discrediting the importance of understanding the algorithms practitioners are using, as this knowledge is still vital in determining and developing intuition as well as in gauging the weaknesses and strengths of an algorithm. We conclude that these are all a testament to the power of reinforcement learning as an algorithmic approximation to true, biological learning."))
# Pointer to the project repository, followed by a page break.
with doc.create(Section('GitHub')):
    doc.append(
        r"A repository containing the code used in this project can be found at:"
    )
    # Two line breaks put the URL on its own line with a gap above it.
    for line_break in (NewLine(), NewLine()):
        doc.append(line_break)
    doc.append(
        r"https://github.com/Iemontine/ProximalPolicyOptimization."
    )
doc.append(NewPage())  # Insert a page break
# One subsection per team member, titled with the member's name and containing
# a plain-text summary of that member's contributions.
with doc.create(Section('Contributions')):
    for member_name, member_summary in (
        ('Darroll Saddi', r'Set up training environment, including integrating libraries & frameworks. Performed inference- and intuition-based hyperparameter tuning to produce training results. Designed reward function, investigating and testing multiple implementation styles. Implemented metric-tracking during training and wrote plotting code. Wrote majority of write-up & GitHub documentation. Contributed reinforcement learning and model training research. Contributed to documentation & project direction; presentation, write-ups, slide deck.'),
        ('Julia Heiler', r'Contributed to documentation; presentation, write-ups, slide deck. Contributed writing majority of speech script, ensuring coherence and cohesion throughout. Contributed PPO research and project direction.'),
        ('Andrew Yeow', r'Implemented proof of concept with DQL. Articulated PPO for communication and explanation purposes.'),
        ('Christine Morataya', r'Contributed to documentation; presentation, write-ups, slide deck.'),
        ('Ryan Li', r'Contributed to documentation; write-ups. Helped experiment with hyperparameter tuning.'),
        ('Steven Yi', r"Articulated PPO for communication and explanation purposes. Contributed to documentation; write-ups."),
    ):
        with doc.create(Subsection(member_name)):
            doc.append(member_summary)
doc.append(NewPage())  # Insert a page break
# Numbered bibliography. These are ordinary (non-raw) strings, so the "\n" in
# each entry is a real newline character placed before the URL; pylatex's
# escaping is expected to turn it into a line break - TODO confirm in the PDF.
with doc.create(Section('References')):
    with doc.create(Enumerate()) as enum:
        enum.add_item("OpenAI, Proximal policy optimization, 2018, Documentation, \n https://spinningup.openai.com/en/latest/algorithms/ppo.html")
        enum.add_item("Farama Foundation, Gymnasium, 2024, Documentation, \n https://gymnasium.farama.org/index.html")
        enum.add_item("Farama Foundation, Stable-Retro, 2024, Documentation, \n https://stable-retro.farama.org/")
        enum.add_item("Alex Nichol, Gotta Learn Fast: A New Benchmark for Generalization in RL, (2018), Report, \n https://arxiv.org/abs/1804.03720")
        enum.add_item("Viet Nguyen, Sonic-PPO-pytorch, 2021, GitHub repository, \n https://github.com/uvipen/Sonic-PPO-pytorch")
        enum.add_item("Rich Sutton, The Bitter Lesson, Article, 2019, \n https://www.cs.utexas.edu/~eunsol/courses/data/bitter_lesson.pdf")
# Compile the document to a PDF named 'output', keeping the generated .tex file.
#
# Compilation is retried, presumably to ride out transient failures such as the
# output file being held open by a viewer - TODO confirm the original reason.
# The retries are bounded: an unbounded loop would spin forever on a failure
# that cannot fix itself (for example a LaTeX error in the document).
MAX_PDF_ATTEMPTS = 3
for pdf_attempt in range(1, MAX_PDF_ATTEMPTS + 1):
    try:
        doc.generate_pdf('output', clean_tex=False)
    except Exception as pdf_error:
        # Catch Exception rather than using a bare `except`, so that
        # KeyboardInterrupt (interrupting the kernel) still stops the cell.
        print("PDF generation attempt", pdf_attempt, "of", MAX_PDF_ATTEMPTS, "failed:", repr(pdf_error))
        if pdf_attempt == MAX_PDF_ATTEMPTS:
            # Out of attempts: surface the real error instead of hiding it.
            raise
    else:
        print("Completed!")
        break

Completed!
