Problem #1

In [20]:
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras import models
from tensorflow.keras import metrics
from tensorflow.keras.datasets import mnist
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#Using keras mnist data instead of given mnist_784.csv for past assignment for simplicity
(x_train, y_train), (x_test, y_test) = mnist.load_data()

#Scale pixel values by 1/255 and flatten every image to a 784-vector.
#reshape(-1, ...) infers the sample count instead of hard-coding 60000/10000.
inputDim = 784
x_train = (x_train/255).reshape(-1, inputDim)
x_test = (x_test/255).reshape(-1, inputDim)

#Input (shared by every autoencoder built below)
inputImg = layers.Input(shape=(inputDim,))

#Keeping lists of autoencoders with different compression rates
nodeCounts = [16, 8, 5, 2]
historys = []
for nodeCount in nodeCounts:
    #Encoder
    hidden = layers.Dense(64, activation='relu')(inputImg)
    encoded = layers.Dense(nodeCount, activation='relu')(hidden)

    #Decoder
    hidden = layers.Dense(64, activation='relu')(encoded)
    output = layers.Dense(inputDim, activation='sigmoid')(hidden)

    #Compiling models
    #NOTE: `encoder` and `autoencoder` are rebound on every pass, so after the loop they refer
    #to the last entry of nodeCounts (the 2-node bottleneck), which the scatter-plot cell uses.
    encoder = models.Model(inputImg, encoded)
    autoencoder = models.Model(inputImg, output)
    autoencoder.compile(optimizer='rmsprop', loss='binary_crossentropy')
    #Decreasing batch size had a huge impact of performance but also slowed down training
    historys.append(autoencoder.fit(x_train, x_train, epochs=50, batch_size=256, shuffle=True,
                                    validation_data=(x_test, x_test)))

#Plotting
#One validation-loss curve per bottleneck size; the legend label is the compression rate
#inputDim / nodeCount, so it stays in sync with nodeCounts (CR=49, 98, 156.8, 392 here).
for history, nodeCount in zip(historys, nodeCounts):
    plt.plot(history.history['val_loss'], label='CR={:g}'.format(inputDim / nodeCount))
plt.ylabel('loss')
plt.xlabel('epoch')
plt.title('Different Compression rate\'s (CR) loss')
plt.legend(loc='upper right')
plt.show()

Train on 60000 samples, validate on 10000 samples
Epoch 1/50
Epoch 2/50

KeyboardInterrupt: ignored

From the results above one can clearly see that as the compression rate increases, the reconstruction accuracy goes down (the loss goes up).  
**When I added the 'accuracy' metric in the 'compile()' step, the training started giving me values of 500+ for loss and below 0.01 for accuracy, so I left it out and hoped the loss alone would suffice to represent the performance.**

In [15]:
#Plotting representation of 2 node bottleneck layer
#NOTE: this cell needs `encoder`, `x_test` and `y_test` from the autoencoder training cell,
#so that cell has to finish first (otherwise `encoder` is undefined). After its loop,
#`encoder` is the model whose bottleneck has 2 nodes.

#Grabbing only classes 0, 1, 2, and 3
#A boolean mask selects the matching rows in one step instead of a Python loop.
keepMask = y_test < 4
x_test_new = x_test[keepMask]
y_test_new = y_test[keepMask]

#Generating points
x_test_encoded = encoder.predict(x_test_new, batch_size=256)
plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test_new)
plt.xlabel('bottleneck node 0')
plt.ylabel('bottleneck node 1')
plt.title('Test digits 0-3 in the 2 node bottleneck')
plt.colorbar(label='digit class')
plt.show()

NameError: ignored

Problem #2

In [None]:
#I used a starting point of (2, 0)
#I used a default reward of -1 for each state
#I assumed each reward/trap state was terminal
#For Q-Learning I used a learning rate of 0.8

#With Q-Learning I got:
# [[ 0.40354785  7.36098107  2.87530616  0.327641  ]
#  [ 8.03453404  0.21923744  1.97936918  6.16347295]
#  [ 2.70918676 -0.79486551 -0.48160546  1.95353194]
#  [ 0.25152245  1.02887878 -1.799135    0.28770349]]

#With value iteration I got:
# [[ 8.          6.75        5.35185185  6.        ]
#  [ 6.75        5.49166666  4.16666666  4.68518518]
#  [ 5.31960607  3.87645742  3.0603323   3.39353479]
#  [ 1.          2.33127999  1.30116875 -4.        ]]

Problem #3

In [0]:
#Just declaring that the below code wasn't written by me

import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

from scores.score_logger import ScoreLogger

#Gym environment id; passed to gym.make() and to ScoreLogger in cartpole()
ENV_NAME = "CartPole-v1"

#Discount factor applied to the best next-state Q-value in experience_replay()
GAMMA = 0.95
#Learning rate handed to the Adam optimizer
LEARNING_RATE = 0.001

#Maximum number of transitions kept in the replay memory deque
MEMORY_SIZE = 1000000
#Transitions sampled per experience_replay() call; replay is skipped until memory holds this many
BATCH_SIZE = 20

#Initial exploration rate (probability that act() picks a random action)
EXPLORATION_MAX = 1.0
#Lower bound the exploration rate is clamped to
EXPLORATION_MIN = 0.01
#Factor the exploration rate is multiplied by after each replay
EXPLORATION_DECAY = 0.995


class DQNSolver:
    """Deep Q-network agent with a replay memory and epsilon-greedy action selection.

    Attributes:
        exploration_rate: Probability that act() returns a random action. Starts at
            EXPLORATION_MAX and is decayed by experience_replay().
        action_space: Number of discrete actions (size of the network output).
        memory: Bounded deque of (state, action, reward, next_state, done) tuples.
        model: Keras network producing one Q-value per action for an observation.
    """

    def __init__(self, observation_space, action_space):
        """Create the replay memory and build/compile the Q-network.

        Args:
            observation_space: Length of the observation vector (network input size).
            action_space: Number of discrete actions (network output size).
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        #`learning_rate` is the supported keyword; `lr` is a deprecated alias
        self.model.compile(loss="mse", optimizer=Adam(learning_rate=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory (oldest entries drop off when full)."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation in the form the network's predict() accepts.

        Returns:
            A random action index with probability exploration_rate, otherwise the
            index of the largest predicted Q-value.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        #verbose=0 keeps Keras from printing a progress bar for every single prediction
        q_values = self.model.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Fit the network on a random minibatch from memory, then decay exploration.

        Does nothing until the memory holds at least BATCH_SIZE transitions. Every
        sampled transition is fitted on its own, one after another.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            #For a terminal transition the target is just the reward
            q_update = reward
            if not terminal:
                #The line underneath this comment is where the temporal difference error is calculated
                #(strictly it builds the TD target; the error is this target minus the current Q-value)
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next, verbose=0)[0]))
            #Only the taken action's Q-value is replaced, so the other outputs add no MSE loss
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        #Decay exploration once per replay, never going below EXPLORATION_MIN
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)


def cartpole(max_runs=None):
    """Train a DQNSolver on the ENV_NAME environment, logging every episode's score.

    Args:
        max_runs: Optional cap on the number of episodes to play. With the default
            None the outer loop has no exit condition of its own (the original
            behaviour), so the cell has to be interrupted by hand.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while max_runs is None or run < max_runs:
        run += 1
        #NOTE(review): assumes the gym API where reset() returns only the observation
        #and step() returns a 4-tuple - confirm against the installed gym version
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            #env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            #Flip the sign of the reward on the step that ends the episode
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                #The episode score is the number of steps it lasted
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            #Train after every non-terminal step
            dqn_solver.experience_replay()


#Start training only when this code runs as the main program (not when it is imported)
if __name__ == "__main__":
    cartpole()

Run: 1, exploration: 1.0, score: 9
Scores: (min: 9, avg: 9, max: 9)

Run: 2, exploration: 0.9229311239742362, score: 27
Scores: (min: 9, avg: 18, max: 27)

Run: 3, exploration: 0.8690529955452602, score: 13
Scores: (min: 9, avg: 16.333333333333332, max: 27)

Run: 4, exploration: 0.7822236754458713, score: 22
Scores: (min: 9, avg: 17.75, max: 27)

Run: 5, exploration: 0.6730128848950395, score: 31
Scores: (min: 9, avg: 20.4, max: 31)

Run: 6, exploration: 0.6180388156137953, score: 18
Scores: (min: 9, avg: 20, max: 31)

Run: 7, exploration: 0.5704072587541458, score: 17
Scores: (min: 9, avg: 19.571428571428573, max: 31)

Run: 8, exploration: 0.5398075216808175, score: 12
Scores: (min: 9, avg: 18.625, max: 31)

Run: 9, exploration: 0.5057535983897912, score: 14
Scores: (min: 9, avg: 18.11111111111111, max: 31)

Run: 10, exploration: 0.4810273709480478, score: 11
Scores: (min: 9, avg: 17.4, max: 31)

Run: 11, exploration: 0.4484282034609769, score: 15
Scores: (min: 9, avg: 17.181818181818

NameError: ignored

The code only ran the program on one CPU core, which made it horrendously slow, and when I tried to render the gym environment to visualize how it actually performed, it ran both the image/video rendering AND the training on my CPU and used 24GB of RAM on my system!

Part (a)  
I added a comment directly above the relevant line in `experience_replay()` in the code above.  
    
Part (b)  
According to the linked article and some of my own experiments, I found the sufficient number of steps to run the program to be around 131, which corresponds to an average score of 195 (which is defined as the solving criterion). The author came to this conclusion after running 30 trials until each one solved (got an average score of 195 or above) and then averaging the number of steps over the 30 trials. I ran it myself 6 times until solving and got results of [120, 146, 137, 141, 121, 175] steps. So a good number to choose would be around 131 steps, or, to be extra confident that it would solve almost every trial, I would choose 150 steps.

Part (c)  
For this part I changed the 'act()' function to the following:

In [0]:
def act2(self, state):
    """Pick a uniformly random action index; the given state is not consulted."""
    n_actions = self.action_space
    return random.randrange(n_actions)

In [0]:
class DQNSolver:
    """Deep Q-network agent variant whose action choice (act2) is purely random.

    Attributes:
        exploration_rate: Starts at EXPLORATION_MAX and is decayed by
            experience_replay(); act2() does not read it.
        action_space: Number of discrete actions (size of the network output).
        memory: Bounded deque of (state, action, reward, next_state, done) tuples.
        model: Keras network producing one Q-value per action for an observation.
    """

    def __init__(self, observation_space, action_space):
        """Create the replay memory and build/compile the Q-network.

        Args:
            observation_space: Length of the observation vector (network input size).
            action_space: Number of discrete actions (network output size).
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        #`learning_rate` is the supported keyword; `lr` is a deprecated alias
        self.model.compile(loss="mse", optimizer=Adam(learning_rate=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory (oldest entries drop off when full)."""
        self.memory.append((state, action, reward, next_state, done))

    #Changed the below function
    def act2(self, state):
        """Return a uniformly random action index; the state and the network are ignored."""
        return random.randrange(self.action_space)

    def experience_replay(self):
        """Fit the network on a random minibatch from memory, then decay exploration.

        Does nothing until the memory holds at least BATCH_SIZE transitions. Every
        sampled transition is fitted on its own, one after another.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            #For a terminal transition the target is just the reward
            q_update = reward
            if not terminal:
                #The line underneath this comment is where the temporal difference error is calculated
                #(strictly it builds the TD target; the error is this target minus the current Q-value)
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next, verbose=0)[0]))
            #Only the taken action's Q-value is replaced, so the other outputs add no MSE loss
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        #Decay exploration once per replay, never going below EXPLORATION_MIN
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)

def cartpole(max_runs=None):
    """Run the CartPole loop with purely random actions (DQNSolver.act2), logging each score.

    Args:
        max_runs: Optional cap on the number of episodes to play. With the default
            None the outer loop has no exit condition of its own (the original
            behaviour), so the cell has to be interrupted by hand.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while max_runs is None or run < max_runs:
        run += 1
        #NOTE(review): assumes the gym API where reset() returns only the observation
        #and step() returns a 4-tuple - confirm against the installed gym version
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            #env.render()
            action = dqn_solver.act2(state) #Change on this line
            state_next, reward, terminal, info = env.step(action)
            #Flip the sign of the reward on the step that ends the episode
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                #The episode score is the number of steps it lasted
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            #The network is still trained after every non-terminal step,
            #even though act2() never consults it
            dqn_solver.experience_replay()

#Run the random-action experiment using the DQNSolver/cartpole definitions of this cell
cartpole()

Run: 1, exploration: 1.0, score: 16
Scores: (min: 16, avg: 16, max: 16)

Run: 2, exploration: 0.9000874278732445, score: 25
Scores: (min: 16, avg: 20.5, max: 25)

Run: 3, exploration: 0.7439808620067382, score: 39
Scores: (min: 16, avg: 26.666666666666668, max: 39)

Run: 4, exploration: 0.7076077347272662, score: 11
Scores: (min: 11, avg: 22.75, max: 39)

Run: 5, exploration: 0.6242658676435396, score: 26
Scores: (min: 11, avg: 23.4, max: 39)

Run: 6, exploration: 0.5937455908197752, score: 11
Scores: (min: 11, avg: 21.333333333333332, max: 39)

Run: 7, exploration: 0.567555222460375, score: 10
Scores: (min: 10, avg: 19.714285714285715, max: 39)

Run: 8, exploration: 0.4907693883854626, score: 30
Scores: (min: 10, avg: 21, max: 39)

Run: 9, exploration: 0.4529463432347434, score: 17
Scores: (min: 10, avg: 20.555555555555557, max: 39)

Run: 10, exploration: 0.4016064652978155, score: 25
Scores: (min: 10, avg: 21, max: 39)

Run: 11, exploration: 0.37251769488706843, score: 16
Scores: (mi

KeyboardInterrupt: ignored

After 131 steps I was left with an average score of 20.71, well under the 195 it should be near.

Part (d)  
To keep exploration at 0.2 I changed the following global variables (I also used the original 'act()' function):

In [0]:
#Pin the exploration rate at 0.2: it starts at 0.2, is clamped to at least 0.2,
#and a decay factor of 1 leaves it unchanged after every replay
EXPLORATION_MAX = 0.2  
EXPLORATION_MIN = 0.2   
EXPLORATION_DECAY = 1

class DQNSolver:
    """Deep Q-network agent with a replay memory and epsilon-greedy action selection.

    Attributes:
        exploration_rate: Probability that act() returns a random action. Starts at
            EXPLORATION_MAX and is multiplied by EXPLORATION_DECAY in experience_replay().
        action_space: Number of discrete actions (size of the network output).
        memory: Bounded deque of (state, action, reward, next_state, done) tuples.
        model: Keras network producing one Q-value per action for an observation.
    """

    def __init__(self, observation_space, action_space):
        """Create the replay memory and build/compile the Q-network.

        Args:
            observation_space: Length of the observation vector (network input size).
            action_space: Number of discrete actions (network output size).
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        #`learning_rate` is the supported keyword; `lr` is a deprecated alias
        self.model.compile(loss="mse", optimizer=Adam(learning_rate=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory (oldest entries drop off when full)."""
        self.memory.append((state, action, reward, next_state, done))

    #Original epsilon-greedy act(); the code is unchanged, only the EXPLORATION_* globals
    #differ for this experiment
    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation in the form the network's predict() accepts.

        Returns:
            A random action index with probability exploration_rate, otherwise the
            index of the largest predicted Q-value.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        #verbose=0 keeps Keras from printing a progress bar for every single prediction
        q_values = self.model.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Fit the network on a random minibatch from memory, then apply exploration decay.

        Does nothing until the memory holds at least BATCH_SIZE transitions. Every
        sampled transition is fitted on its own, one after another.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            #For a terminal transition the target is just the reward
            q_update = reward
            if not terminal:
                #The line underneath this comment is where the temporal difference error is calculated
                #(strictly it builds the TD target; the error is this target minus the current Q-value)
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next, verbose=0)[0]))
            #Only the taken action's Q-value is replaced, so the other outputs add no MSE loss
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        #Multiply by the decay factor once per replay, never going below EXPLORATION_MIN
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)
        
def cartpole(max_runs=None):
    """Train a DQNSolver on the ENV_NAME environment, logging every episode's score.

    Args:
        max_runs: Optional cap on the number of episodes to play. With the default
            None the outer loop has no exit condition of its own (the original
            behaviour), so the cell has to be interrupted by hand.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while max_runs is None or run < max_runs:
        run += 1
        #NOTE(review): assumes the gym API where reset() returns only the observation
        #and step() returns a 4-tuple - confirm against the installed gym version
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            #env.render()
            action = dqn_solver.act(state) #Back to the original epsilon-greedy act()
            state_next, reward, terminal, info = env.step(action)
            #Flip the sign of the reward on the step that ends the episode
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                #The episode score is the number of steps it lasted
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            #Train after every non-terminal step
            dqn_solver.experience_replay()

#Run the fixed-exploration experiment using the globals and definitions of this cell
cartpole()

Run: 1, exploration: 0.2, score: 14
Scores: (min: 14, avg: 14, max: 14)

Run: 2, exploration: 0.2, score: 9
Scores: (min: 9, avg: 11.5, max: 14)

Run: 3, exploration: 0.2, score: 9
Scores: (min: 9, avg: 10.666666666666666, max: 14)

Run: 4, exploration: 0.2, score: 9
Scores: (min: 9, avg: 10.25, max: 14)

Run: 5, exploration: 0.2, score: 12
Scores: (min: 9, avg: 10.6, max: 14)

Run: 6, exploration: 0.2, score: 8
Scores: (min: 8, avg: 10.166666666666666, max: 14)

Run: 7, exploration: 0.2, score: 11
Scores: (min: 8, avg: 10.285714285714286, max: 14)

Run: 8, exploration: 0.2, score: 16
Scores: (min: 8, avg: 11, max: 16)

Run: 9, exploration: 0.2, score: 13
Scores: (min: 8, avg: 11.222222222222221, max: 16)

Run: 10, exploration: 0.2, score: 12
Scores: (min: 8, avg: 11.3, max: 16)

Run: 11, exploration: 0.2, score: 10
Scores: (min: 8, avg: 11.181818181818182, max: 16)

Run: 12, exploration: 0.2, score: 13
Scores: (min: 8, avg: 11.333333333333334, max: 16)

Run: 13, exploration: 0.2, scor

NameError: ignored

Then it reached an average score of 197.47 after 127 steps.  
Which is on par with the default results. Could have been a good run, but we'll leave it at that.