In [2]:
from ast import Sub
from pylatex.utils import italic, bold, NoEscape
from pylatex import Document, Section, Subsection, Subsubsection, Command, Tabular, Math, TikZ, Axis, Plot, Figure, Matrix, Alignat, NewPage, NewLine
from pylatex.section import Paragraph, Subparagraph, Chapter
from pylatex.utils import italic
import numpy as np

# =================================[PAGE SETUP]=================================
# Page-geometry options handed to the Document constructor below.
geometry_options = dict(margin="1in", includeheadfoot=False)
doc = Document(
    geometry_options=geometry_options,
    lmodern=True,
)

# =================================[TITLE]=================================
# Title, author and date are registered in the preamble; \maketitle then
# renders them at the start of the document body.
#
# Affiliation markers use \textsuperscript{...}: a bare ^ is only legal in math
# mode, so writing "Name^..." in running text makes LaTeX raise
# "Missing $ inserted".

doc.preamble.append(Command('title', NoEscape(
    r'Gameplaying AI: Reimplementing Proximal Policy Optimization\\ '
    r'\large Introduction to Artificial Intelligence Project\\ '
    r'\large ECS 170 Fall 2024'
)))
# NOTE(review): the "?" markers look like affiliations still to be filled in,
# and marker 2 is defined below but not yet attached to any author - confirm.
doc.preamble.append(Command('author', NoEscape(
    r'Darroll Saddi\textsuperscript{1}, Andrew Yeow\textsuperscript{1}, '
    r'Christine Morayata\textsuperscript{?}, Julia Heiler\textsuperscript{?}, '
    r'Ryan Li\textsuperscript{1}, Steven Yi\textsuperscript{?}\\ '
    r'\small\textsuperscript{1}University of California, Davis - Computer Science\\'
    r'\small\textsuperscript{2}University of California, Davis - Cognitive Science'
)))
doc.preamble.append(Command('date', NoEscape(r'\today')))
doc.append(NoEscape(r'\maketitle'))

# Abstract
# Emitted as raw LaTeX (NoEscape) so the abstract environment is passed through
# verbatim. The trademark mark uses \textsuperscript{TM} because ^ is not valid
# outside math mode.
doc.append(NoEscape(
    r'\begin{abstract}'
    r'This report documents a retrospective reimplementation of proximal policy optimization (PPO), performed at UC Davis for study purposes. '
    r'Our goal was to not only use the algorithm to successfully train a model to play Sonic the Hedgehog\textsuperscript{TM}, but to additionally discover how to use, articulate, and implement reinforcement learning algorithms while being able to communicate PPO and reinforcement learning through practical experience. '
    r'We also present this report as an insightful and educational resource for those interested in reinforcement learning and/or recreating our training environment.'
    r'\end{abstract}'
))

# Introduction
# All prose below is appended as raw LaTeX (NoEscape). Two rules keep it valid:
#   * Strings containing an apostrophe are written as r"..." - inside r'...'
#     a \' keeps its backslash, and LaTeX reads \' as the acute-accent command.
#   * Superscripts in running text use \textsuperscript{...}; ^ is math-only.
with doc.create(Section('Introduction')):
    doc.append(NoEscape(
        r"Proximal Policy Optimization (PPO) is a reinforcement learning algorithm that has been used to train agents in continuous action space environments, including video games. "
        r"It is a policy gradient method that is designed to be simple to implement and computationally efficient. "
        r"Being an on-policy method, the algorithm learns a policy to make decisions in the environment. "
        r"This is different from an off-policy method, which learns the value of the optimal policy independently of the agent's action. "
        r"In this project, we reimplemented PPO and used it to train an agent to play and complete levels in Sonic the Hedgehog\textsuperscript{TM}. "
        r"On a high level, PPO works by choosing an action for the agent to take, then observing the resultant state and reward. "
        r"Then, an estimate for the advantage gain is computed (see GAE), which basically measures how much better the action was compared to the average action at that state. "
        r"The policy is then updated using a special objective function to prevent the policy from updating too much on a single episode by penalizing if the new policy deviates too much from the original policy, ensuring stability. "
        r"These steps repeat until training is complete. "
        r"Here is an example of a citation\footnote{This is a reference}. "
        r"Here is another reference\footnote{This is another reference}. "
        r"This is how I would cite the first reference again\textsuperscript{1}."
    ))
    # More context on PPO needed above
    # TODO: Be sure to draw on multiple sources for this introduction. Explain motivation for project.
    with doc.create(Subsection('Use-Cases')):
        doc.append(NoEscape(r'PPO is a versatile algorithm that can be used in a variety of environments. It is particularly well-suited for continuous action spaces, which makes it a good candidate for training agents in games like Sonic the Hedgehog\textsuperscript{TM}.'))
        # TODO: Talk about the build-up to PPO, and how it excels over its predecessors.
    with doc.create(Subsection('PPO Implementation')):
        # TODO: PLS REWRITE + ADD REFERENCES
        with doc.create(Paragraph('Actor-Critic')):
            doc.append(NoEscape(r'PPO is an actor-critic algorithm, which means that it uses two neural networks to approximate the policy and value functions. The actor network takes the current state as input and outputs a probability distribution over possible actions. The critic network takes the current state as input and outputs an estimate of the value of the state. Both use the metric of "advantage" to update weights, which measures how much better taking a particular action is compared to the average action.'))
        with doc.create(Paragraph('Generalized Advantage Estimation (GAE)')):
            # NOTE(review): the first sentence has no verb ("This function ...
            # 'how good is the current state'"); left for the authors to reword.
            # The quoted phrase uses LaTeX-style `...' quotes.
            doc.append(NoEscape(r"This function `how good is the current state'. It is used to calculate the advantage of taking an action in a given state, which is then used to update the policy. The advantage is a measure of how much better an action is than the average action, and it is used to determine how to update the policy to improve performance."))
        with doc.create(Paragraph('Surrogate Objective Function')):
            doc.append(NoEscape(r'The key to PPO is the surrogate objective function, which helps maximize the probability of taking an action that may eventually lead to high reward. Its main purpose is to keep updates within a trust region, using clipping (see below).'))
        with doc.create(Paragraph('Clipping')):
            doc.append(NoEscape(r'This is a technique used to prevent the policy from changing too much between updates. It is used to ensure that the policy does not change too much between updates, which would normally lead to training instability. It does this by clipping the probability ratio between the new and old policies.'))
            # TODO: invest in some visualization
            # TODO: include an image that connects every part of the algorithm

            # --- Placeholder demo: examples of math that pylatex can display ---
            a = np.array([[100, 10, 20]]).T  # transposed 1x3 row -> 3x1 column
            M = np.matrix([[2, 3, 4],
                            [0, 0, 1],
                            [0, 0, 2]])
            # Inline equation; the right-hand side now matches the product.
            doc.append(Math(data=['2*3', '=', 6]))
            # Four-column table (column spec 'rc|cl') with a full rule on top
            # and a partial rule over columns 1-2 after the first row.
            with doc.create(Tabular('rc|cl')) as table:
                table.add_hline()
                table.add_row((1, 2, 3, 4))
                table.add_hline(1, 2)
                table.add_empty_row()
                table.add_row((4, 5, 6, 7))
            # M is an np.matrix, so M * a is a matrix product (not elementwise).
            doc.append(Math(data=[Matrix(M), Matrix(a), '=', Matrix(M * a)]))
            # Unnumbered alignat environment fed with raw LaTeX.
            with doc.create(Alignat(numbering=False, escape=False)) as agn:
                agn.append(r'\frac{a}{b} &= 0 \\')

with doc.create(Section('GitHub')):
    # Items are appended in order: the lead-in sentence, two consecutive
    # NewLine() objects, then the repository URL as a plain string.
    github_items = (
        r"A repository containing the code used in this project can be found at:",
        NewLine(),
        NewLine(),
        r"https://github.com/Iemontine/ProximalPolicyOptimization.",
    )
    for item in github_items:
        doc.append(item)

    # TODO: move this
    with doc.create(Subsection('Cool graph')):
        # Path is relative; the image is expected next to the notebook.
        image_filename = "Pass1-3_500k_-0.01punishment_2.jpg"
        with doc.create(Figure(position='h!')) as fig:
            fig.add_image(image_filename, width='300px')
            fig.add_caption('Look at this photograph')

doc.append(NewPage())  # page break before the Contributions section

# (team member, contribution text) pairs, listed in the order in which their
# subsections are written to the document.
contributions = (
    ('Darroll Saddi', r'Set up training environment, including integrating libraries & frameworks. Performed inference- and intuition-based hyperparameter tuning to produce training results. Designed reward function & investigated multiple implementations including the multi-pass implementation. Implemented metric-tracking during training and wrote plotting code. Contributed to documentation on GitHub for recreating the environment and others. Contributed research efforts and project direction.'),
    ('Steven Yi', r"Refactored OpenAI's proximal policy optimization implementation to suit our purposes."),
    ('Andrew Yeow', r'Implemented proof of concept with DQL.'),
    ('Julia Heiler', r'Contributed to documentation.'),
    ('Ryan Li', r'Contributed to documentation.'),
    ('Christine Morayata', r'Contributed to documentation.'),
)

with doc.create(Section('Contributions')):
    # One subsection per member, titled with the member's name.
    for member, summary in contributions:
        with doc.create(Subsection(member)):
            doc.append(summary)
# Build the PDF under the base name 'output' (clean_tex=True is passed through
# to pylatex). A failure is reported rather than silently discarded, so a
# broken build can no longer look like a successful run; the cell still
# finishes without raising, as before.
try:
    doc.generate_pdf('output', clean_tex=True)
except Exception as e:
    print("PDF generation failed:", repr(e))
else:
    # Reached only when generate_pdf returned without raising.
    print("Completed!")

Completed!
