In [1]:
import gym
import math
import random
import numpy as np
import matplotlib
import pandas as pd

env = gym.make('Marvin-v0')

In [None]:
# Let's try to gather the data from game samples


num_episodes = 10000
num_steps_per_episode = 1000


traning_data = []
for _ in range(num_episodes):
    prev_observation = env.reset()
    
    for step in range(num_steps_per_episode):        
        # Let's choose a random action
        action = env.action_space.sample()

        # Make your step and collect results
        observation, reward, done, info = env.step(action)
        traning_data.append([prev_observation, action, reward, done])
        prev_observation = observation
        if done == True:
            break 
    

data = pd.DataFrame(traning_data, columns=['Observation', 'Action', 'Reward', 'Done'])
# data.to_csv('./walking-marvin.csv')

In [2]:
# Piece of code to read the data from csv file
data = pd.read_csv('./walking-marvin.csv')
data.columns = ['Index', 'Observation', 'Action', 'Reward', 'Done']

In [3]:
# Let's try to explore the data we gath


data.plot.scatter(x='Index', y='Reward')
print(len(data[data['Reward'] > 0]))
print(len(data[data['Reward'] < 0].head(412394)))

positiveRewards = data[data['Reward'] > 0]
negativeRewards = data[data['Reward'] != -100].head(412394)

positiveRewards.plot(y='Reward', figsize=(18, 16))
negativeRewards.plot(y='Reward', figsize=(18, 16))

412394
412394


<matplotlib.axes._subplots.AxesSubplot at 0x11890ac88>

 I want to create the neuronNetwork that will take an observation of a enviroment (24 dimensional vector) and returns
 me an action (4 dimensional vector) I then send this action to env.step and receive a reward. Somehow i need to update
 the weights of my network using the reward. My reward should be as big as possible. (1 - reward) is my error? 
 Looks like so. 
 I need three layers of neurons. each neuron has a weight property, activation function, function to update the weight
 Each layer consist of several neurons. I need to use the weight inside the neuron. so layers just run neurons commands
 and play as a coordinator.




In [2]:
def mult_and_sum(a, b):
    result = 0
    for i in range(len(a)):
        result += a[i] * b[i]
    return result


class Neuron(object):
    def __init__(self, input_counts = 1, activation_function = math.tanh,
                update_function=None, learning_rate = 0.001, input_layer = False, value = None):
        self.input_layer = input_layer
        if self.input_layer == False:
            self.weights = [random.uniform(-1, 1)] * input_counts
        self.value = value
        self.activation_function = activation_function
        self.update_function = self.update_weights if update_function == None else update_function
        self.learning_rate = learning_rate
    
    def setValue(self, value):
        self.value = value
        

    def __str__(self):
        return "\n--------------\nself.weights: {}\nself.value: {}\n--------------\n".format(self.weights, self.value)
    
        
    def activate(self, inputs = None):
        if (self.input_layer):
            return self.value
        self.setValue(self.activation_function(mult_and_sum(inputs, self.weights)))
        return self.value
    
    def update_weights(self, inp, update):
        # I should update the weight only if the reward is bigger? 
        errors = self.learning_rate * (update - self.activate(inp))
        self.weights = np.add(self.weights, np.multiply(inp, errors))
    
        


In [3]:
def linear_classifiyer(x):
    return x

class Layer(object):
    def __init__(self, neuron_count, input_counts = 1, activation_function = linear_classifiyer):
        random.seed()
        self.neurons = []
        self.input_layer = True if input_counts == 1 else False
       
        for _ in range(neuron_count):
            self.neurons.append(Neuron(input_counts, input_layer = self.input_layer))
    
    def setValues(self, inputs):
        for index, neuron in enumerate(self.neurons):
            neuron.setValue(inputs[index])
    
    
    def __str__(self):
        message = "[input_layer: {},\nneurons: {}]"
        return message.format(self.input_layer, len(self.neurons))

    def update(self, inputs, reward):
        if (self.input_layer):
            return 
        for neuron in self.neurons:
            neuron.update_weights(inputs, reward)
    
    def output(self, inputs = None):
        layer_output = []
        for neuron in self.neurons:
            layer_output.append(neuron.activate(inputs))
        return layer_output
            
    


In [4]:
class Brain(object):
    def __init__(self, input_counts, output_counts, hidden_layers, neurons_per_hidden_layer):
        self.layers = []
        self.input_layer = Layer(neuron_count = input_counts, input_counts = 1)
        self.layers.append(self.input_layer)
        transition_input_counts = input_counts
        for _ in range(hidden_layers):
            self.layers.append( Layer(neurons_per_hidden_layer, input_counts=transition_input_counts,
                                        activation_function = math.tanh))
            transition_input_counts = neurons_per_hidden_layer
            
        self.output_layer = Layer(output_counts, input_counts = transition_input_counts, activation_function=math.tanh)
        self.layers.append(self.output_layer)
        
    def __str__(self):
        message = "\n\nlayers: {}\ninput_counts: {},\noutput_counts: {},\nhidden_layers: {}\n\n"
        return message.format(len(self.layers),
                              len(self.layers[0].neurons),
                              len(self.layers[-1].neurons),
                              len(self.layers) - 2)
    
    def generate_action(self, observation):
        self.input_layer.setValues(observation)
        inputs  = self.input_layer.output()
        for hidden_layer in self.layers[1: -1]:
            inputs = hidden_layer.output(inputs)
        return self.output_layer.output(inputs)
    
    def learn(self, observation, reward):
        self.input_layer.setValues(observation)
        layer_input = self.input_layer.output()
        for layer in self.layers[1:]:
            layer.update(layer_input, reward)
            layer_input = layer.output(layer_input)

In [5]:
marvin = Brain(24, 4, 1, 32)





In [7]:
num_episodes = 3000
max_steps_per_episode = 2000

learning_rate = 0.001
discount_rate = 0.99

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

rewards_all_episodes = []



for episode in range(num_episodes):
    observation = env.reset()
    done = False
    rewards_current_episode = 0
    if (episode % 100 == 0):
        print("{} \n".format(episode))
    for step in range(max_steps_per_episode):
         # Exploration-Explotation trade off
#         env.render()
        exploration_rate_treshold = random.uniform(0,1)
        if exploration_rate_treshold >= exploration_rate:
            action = marvin.generate_action(observation)
        else:
            action = env.action_space.sample()

        new_observation, reward, done, info = env.step(action)
#         print(reward, done, info)
        # Update my weights according to reward on each neuron in my brain.
        marvin.learn(observation, reward)
        rewards_current_episode += reward
        if done == True:
            break
        
#     #Exploration decay logic 
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
    rewards_all_episodes.append(rewards_current_episode) 
print('Done')

0 

100 

200 

300 

400 

500 

600 

700 

800 

900 

1000 

1100 

1200 

1300 

1400 

1500 

1600 

1700 

1800 

1900 

2000 

2100 

2200 

2300 

2400 

2500 

2600 

2700 

2800 

2900 

Done


In [None]:
# #Calculate and print the average reward per 1000 episodes
perE = 1000

rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), num_episodes / perE)
count = perE
print("*** Average reward per {} episodes ***\n".format(perE))
for r in rewards_per_thousand_episodes:
    print(count, ": ", str(sum(r/ 1000)))
    count += perE
    

In [8]:
panda = pd.DataFramerame(rewards_all_episodes)
panda.to_csv('./result.csv')

AttributeError: module 'pandas' has no attribute 'DataFramerame'

In [30]:
observation = env.reset()
for step in range(max_steps_per_episode):
    env.render()
    action = marvin.generate_action(observation)
    new_observation, reward, done, info = env.step(action)
    marvin.learn(observation, reward)
    if (done == True):
        break 