## Let's start by importing the necessary libraries

In [1]:
import gym
import numpy as np
from gym import wrappers

from numpy import *
from numpy.random import uniform, normal
from numpy.linalg import norm

from random import shuffle
from collections import deque
from statistics import mean

In [2]:
# Create the CartPole-v1 environment. CartPoleAgent below reads this
# module-level `env` directly instead of taking it as an argument.
env = gym.make('CartPole-v1')

## Declare global variables

In [3]:
# Hyperparameters and run limits shared by the cells below.
alpha = 0.2              # LMS learning rate
maxEpisodes = 1_000      # upper bound on the number of training episodes
maxTimeSteps = 500       # upper bound on the steps taken within one episode
fixedNorm = 0.5          # output weights are rescaled to have norm = 'fixedNorm'
maxHistory = 5_000       # capacity of the observation / output history buffers
solvedEpisodes = 100     # number of most recent episodes in the reward average
episodeLength = 500      # CartPole-v1 target; solved when the average reward equals it

### Observations Transform

In [4]:
inputLength = 4          # number of entries in one observation vector
expansionFactor = 30     # transformed features produced per observation entry
expandedLength = inputLength * expansionFactor  # size of a transformed observation

### Feature transform with fixed random weights

In [20]:
# Fixed random projection: one row per expanded feature, one column per
# observation entry, drawn from a zero-mean, unit-variance normal distribution.
V = normal(loc=0.0, scale=1.0, size=(expandedLength, inputLength))

### Output weights, randomly initialized

In [6]:
# Initial output weights: one value per expanded feature, uniform in [-1, 1).
W = uniform(-1.0, 1.0, size=expandedLength)

#### Fix the norm of the output weights to 'fixedNorm'

In [7]:
# Rescale W in place so that norm(W) equals fixedNorm.
W *= fixedNorm/norm(W)

# Cart Pole NN implementation

In [22]:
def CartPoleAgent(aplha, W, V):
    """Train a linear read-out on fixed random tanh features to balance CartPole.

    At every step the observation is expanded to ``tanh(dot(V, observation))``
    and projected onto ``W``. A negative output selects action 0, any other
    output selects action 1. When an episode ends before ``episodeLength``
    steps, the stored observations are replayed in random order and ``W`` is
    adjusted with the Widrow-Hoff (LMS) rule; the targets are the means of the
    recorded negative and positive outputs. ``W`` is rescaled to norm
    ``fixedNorm`` after every single update.

    The function reads the module-level ``env``, ``maxHistory``,
    ``solvedEpisodes``, ``maxEpisodes``, ``maxTimeSteps``, ``episodeLength``
    and ``fixedNorm``. It uses the Gym API in which ``env.reset()`` returns the
    observation and ``env.step()`` returns a 4-tuple.

    Parameters
    ----------
    aplha : float
        LMS learning rate. The misspelled name is kept so that existing
        keyword callers keep working.
    W : numpy.ndarray
        Output weights. Modified in place; this is how the trained weights
        reach the caller.
    V : numpy.ndarray
        Fixed projection applied to each observation. Never modified.

    Returns
    -------
    None
        Returns early, after printing "Solved", once the reward average over
        the last ``solvedEpisodes`` episodes equals ``episodeLength``;
        otherwise returns after ``maxEpisodes`` episodes.
    """
    #--------------------------------------------
    # Observation history (bounded: when full, the oldest entry is dropped)
    H = deque([], maxHistory)
    # episode total reward history
    R = deque([], solvedEpisodes)
    # histories of positive and negative outputs; seeded with 0 so that
    # mean() never sees an empty sequence
    PO = deque([0], maxHistory)
    NO = deque([0], maxHistory)
    #--------------------------------------------
    for episode in range(maxEpisodes):
        observation = env.reset()
        H.append(observation)
        totalReward = 0
        # NOTE: if this loop runs out before the environment reports `done`,
        # the episode is neither recorded in R nor printed.
        for t in range(1, maxTimeSteps + 1):
            env.render()
            #--------------------------------------------
            out = dot(tanh(dot(V, observation)), W)
            if out < 0:
                NO.append(out)
                action = 0
            else:
                PO.append(out)
                action = 1
            #--------------------------------------------
            observation, reward, done, _ = env.step(action)
            H.append(observation)
            totalReward += reward
            #--------------------------------------------
            if done:
                R.append(totalReward)
                # PO and NO do not change during the replay below, so these
                # means serve both as LMS targets and as the values printed.
                mn = mean(NO)
                mp = mean(PO)
                if t < episodeLength:
                    #--------------------------------------------
                    # Replay shuffled past observations using the
                    # latest weights.
                    # Use the means of past outputs as
                    # LMS algorithm target outputs.
                    #
                    # Shuffle a snapshot, not H itself: permuting the
                    # bounded deque in place would make later appends
                    # evict random observations instead of the oldest
                    # ones, and index-based shuffling of a deque is
                    # quadratic in its length.
                    #--------------------------------------------
                    replay = list(H)
                    shuffle(replay)
                    for obs in replay:
                        h = tanh(dot(V, obs))     # transform the observation
                        out = dot(h, W)
                        if out < 0:
                            e = mn - out
                        else:
                            e = mp - out
                        W += aplha * e * h        # Widrow-Hoff LMS update
                        W *= fixedNorm/norm(W)    # keep the weights at fixed norm
                    #---------------------------------------------
                #--------------------------------------------------
                # Dividing by the window size (not len(R)) keeps the average
                # below the target until the window has filled.
                avgReward = sum(R)/solvedEpisodes
                print(f"[{episode:3d}:{totalReward:3.0f}] R:{avgReward:6.2f} mp:{mp:7.3f} mn:{mn:7.3f}  len(H):{len(H):4d}  W:{W[:2]}", flush=True)
                #---------------------------------------------
                if avgReward == episodeLength:
                    print("Solved")
                    return
                #---------------------------------------------
                break
        #---------------------------------------------
    #---------------------------------------------

In [27]:
# Train the agent. The environment is closed even when training raises or is
# interrupted from the notebook, so whatever env.render() opened is released.
try:
    CartPoleAgent(alpha, W, V)
finally:
    env.close()

[  0: 49] R:  0.49 mp:  0.416 mn: -0.176  len(H):  50  W:[ 0.12690107 -0.04083459]
[  1:101] R:  1.50 mp:  0.253 mn: -0.142  len(H): 152  W:[ 0.08298972 -0.02048573]
[  2: 10] R:  1.60 mp:  0.253 mn: -0.247  len(H): 163  W:[ 0.03370082 -0.00082018]
[  3:105] R:  2.65 mp:  0.546 mn: -0.418  len(H): 269  W:[-0.02553592 -0.03031049]
[  4: 61] R:  3.26 mp:  0.530 mn: -0.366  len(H): 331  W:[0.10380925 0.00873646]
[  5: 41] R:  3.67 mp:  0.502 mn: -0.355  len(H): 373  W:[ 0.1075776 -0.0419355]
[  6: 47] R:  4.14 mp:  0.457 mn: -0.329  len(H): 421  W:[ 0.03350791 -0.0431425 ]
[  7:  9] R:  4.23 mp:  0.457 mn: -0.385  len(H): 431  W:[-0.02391491  0.02926472]
[  8: 10] R:  4.33 mp:  0.457 mn: -0.515  len(H): 442  W:[-0.01296161 -0.02240454]
[  9:  9] R:  4.42 mp:  0.457 mn: -0.531  len(H): 452  W:[ 0.07488064 -0.04111523]
[ 10: 89] R:  5.31 mp:  0.453 mn: -0.521  len(H): 542  W:[-0.09699483  0.01210728]
[ 11: 15] R:  5.46 mp:  0.442 mn: -0.511  len(H): 558  W:[-0.03255088  0.05810095]
[ 12:103