<a href="https://colab.research.google.com/github/ManikantaMandlem/Deep-Q-Network-to-play-2048-Game/blob/main/DRL_Final_Project_cartpole_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Try the DQN on the CartPole problem first to check that everything is working - this should be an easy task to solve.


In [None]:
!pip install gym-2048

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import gym_2048
import gym
import torch.nn as nn
import torch
from collections import deque
import numpy as np
from datetime import datetime
from random import sample
import matplotlib.pyplot as plt
import time

In [None]:
#defining a neural network class for network initializations

class torch_model(nn.Module):
    """Fully connected feed-forward network assembled from a list of layer widths.

    ``layers`` lists the width of every layer, input first and output last.
    Each hidden layer is built as ``Linear -> activation -> [BatchNorm1d] ->
    Dropout``; the final layer is a bare ``Linear`` with no activation.

    Args:
        layers: Sequence of layer widths, e.g. ``[4, 512, 256, 64, 2]``.
        dropout: Drop probability handed to every ``nn.Dropout``.
        normalization: When truthy, a ``nn.BatchNorm1d`` follows each hidden
            activation.
        hidden_activation: One of ``'relu'``, ``'sigmoid'``, ``'silu'``,
            ``'tanh'``, ``'celu'``.

    Raises:
        Exception: If a hidden layer is being built and ``hidden_activation``
            is not one of the recognised names.
    """

    def __init__(self,layers,dropout=0,normalization=False,hidden_activation='relu'):
        super().__init__()
        self.layers = layers
        self.dropout = dropout
        self.normalization = normalization
        self.hidden_activation = hidden_activation

        # Name -> module class; matched with == so any comparable value works.
        known_activations = (
            ('relu', nn.ReLU),
            ('sigmoid', nn.Sigmoid),
            ('silu', nn.SiLU),
            ('tanh', nn.Tanh),
            ('celu', nn.CELU),
        )

        stack = []
        # Walk consecutive (in, out) width pairs of the hidden layers only.
        for width_in, width_out in zip(self.layers[:-2], self.layers[1:-1]):
            stack.append(nn.Linear(width_in, width_out))
            activation_cls = next(
                (cls for name, cls in known_activations if hidden_activation == name),
                None,
            )
            if activation_cls is None:
                raise Exception('activation function not recognized; available options are relu, sigmoid, silu, tanh, celu')
            stack.append(activation_cls())
            if normalization:
                stack.append(nn.BatchNorm1d(width_out))
            stack.append(nn.Dropout(self.dropout))

        # Output head: plain linear map, no activation.
        stack.append(nn.Linear(self.layers[-2], self.layers[-1]))
        # ModuleList registers the sub-modules so their parameters are tracked.
        self.nn_layers = nn.ModuleList(stack)

    def forward(self,x):
        """Pass ``x`` through every registered layer in order and return the result."""
        out = x
        for module in self.nn_layers:
            out = module(out)
        return out

In [None]:
def print_model(model):
    """Print every state-dict entry of ``model`` with its shape, then the trainable-parameter total.

    Args:
        model: Any ``nn.Module``.
    """
    # One line per stored tensor: name and shape.
    print("Model's state dictionary (stored weights):")
    for entry_name, tensor in model.state_dict().items():
        print("  ", entry_name, "\t", tuple(tensor.size()))

    # Count only parameters that take part in gradient updates.
    trainable = [p.numel() for p in model.parameters() if p.requires_grad]
    parameter_count = sum(trainable)
    print("In total, this network has ", parameter_count, " trainable parameters")

In [None]:
class rl_agent():
    """Deep Q-Network agent that learns CartPole-v1 from an experience-replay buffer.

    A single network (no separate target network) both selects actions and
    provides the bootstrap value for the TD target.

    Args:
        kwargs: Plain dict of hyper-parameters. Keys read here:
            ``episodes``, ``memory_length``, ``gamma``, ``epsilon_start``,
            ``epsilon_min``, ``epsilon_decay``, ``batch_size``, ``train_start``,
            ``layers``, ``dropout``, ``batch_norm``, ``activation``, ``lr``.
    """

    def __init__(self,kwargs):
        self.environment = gym.make('CartPole-v1')
        # For a 2-D observation (e.g. a 2048 board) this would need to be
        # shape[0] * shape[1]; CartPole observations are 1-D.
        self.state_size = self.environment.observation_space.shape[0]
        self.action_size = self.environment.action_space.n
        self.episodes = kwargs['episodes']
        # Replay buffer of (state, action, reward, next_state, done) tuples.
        # Its capacity comes from the configuration; it is created exactly once
        # so that 'memory_length' is actually honoured.
        self.memory = deque(maxlen=kwargs['memory_length'])
        self.gamma = kwargs['gamma']    # discount rate
        self.epsilon = kwargs['epsilon_start']  # exploration rate
        self.epsilon_min = kwargs['epsilon_min']
        self.epsilon_decay = kwargs['epsilon_decay']
        self.batch_size = kwargs['batch_size']
        self.train_start = kwargs['train_start']
        self.model = torch_model(layers=kwargs['layers'],dropout=kwargs['dropout'],
                                 normalization=kwargs['batch_norm'],hidden_activation=kwargs['activation'])
        self.loss_fn = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=kwargs['lr'])

    def remember(self, state, action, reward, next_state, done):
        """Store one transition and decay epsilon once the buffer is warm.

        Epsilon is multiplied by ``epsilon_decay`` on every stored transition
        after the buffer holds more than ``train_start`` entries, until it
        drops to ``epsilon_min``.
        """
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) > self.train_start and self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def replay(self):
        """Run one gradient step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``train_start``
        transitions (and at least ``batch_size``, so sampling cannot fail).
        The regression target equals the current prediction everywhere except
        at the action actually taken, where it is ``r`` for terminal
        transitions and ``r + gamma * max_a' Q(s', a')`` otherwise.
        """
        if len(self.memory) < max(self.train_start, self.batch_size):
            return

        minibatch = sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Stored states are (1, state_size); flatten the extra axis away.
        state = torch.stack(states).reshape(self.batch_size, self.state_size).float()
        next_state = torch.stack(next_states).reshape(self.batch_size, self.state_size).float()
        action = torch.tensor([int(a) for a in actions], dtype=torch.long)
        reward = torch.tensor([float(r) for r in rewards], dtype=torch.float32)
        not_done = 1.0 - torch.tensor([float(bool(d)) for d in dones], dtype=torch.float32)

        self.model.train()
        q_values = self.model(state)

        # The target must be a constant w.r.t. the parameters: build it with
        # autograd disabled so no gradient leaks through the bootstrap term.
        with torch.no_grad():
            q_next_max = self.model(next_state).max(dim=1).values
            q_target = q_values.detach().clone()
            td_target = reward + self.gamma * q_next_max * not_done
            q_target[torch.arange(self.batch_size), action] = td_target

        self.optimizer.zero_grad()
        loss = self.loss_fn(q_values, q_target)
        loss.backward()
        self.optimizer.step()

    def _to_state(self, observation):
        """Convert an environment observation to a float32 (1, state_size) tensor (copying)."""
        return torch.tensor(np.asarray(observation), dtype=torch.float32).reshape(1, self.state_size)

    def _reset_env(self):
        """Reset the environment and return only the observation, for old and new Gym APIs."""
        result = self.environment.reset()
        # gym >= 0.26 returns (observation, info); older versions return the observation.
        if isinstance(result, tuple):
            result = result[0]
        return result

    def _step_env(self, action):
        """Step the environment and return (observation, reward, done) for old and new Gym APIs."""
        result = self.environment.step(action)
        if len(result) == 5:
            # gym >= 0.26: (obs, reward, terminated, truncated, info)
            observation, reward, terminated, truncated, _ = result
            return observation, reward, bool(terminated or truncated)
        observation, reward, done, _ = result
        return observation, reward, bool(done)

    def _select_action(self, state):
        """Epsilon-greedy choice: uniform random with probability epsilon, else argmax Q."""
        if np.random.rand() < self.epsilon:
            return int(np.random.randint(self.action_size))
        # eval() so BatchNorm/Dropout behave deterministically on a single sample;
        # no_grad() because this forward pass is never back-propagated.
        self.model.eval()
        with torch.no_grad():
            return int(torch.argmax(self.model(state)).item())

    def training(self):
        """Play ``self.episodes`` episodes, learning after every environment step.

        Returns:
            list[int]: Number of steps survived in each episode, in order.
        """
        scores = []
        max_steps = self.environment._max_episode_steps
        for episode in range(self.episodes):
            state = self._to_state(self._reset_env())
            done = False
            i = 0
            while not done:
                final_action = self._select_action(state)
                observation, reward, done = self._step_env(final_action)
                next_state = self._to_state(observation)
                # Penalise dropping the pole, but not an episode that merely
                # reached the environment's step limit.
                if done and i != max_steps - 1:
                    reward = -100
                self.remember(state, final_action, reward, next_state, done)
                state = next_state
                i += 1
                if done:
                    scores.append(i)
                    timestamp = datetime.now().strftime("%H:%M:%S")
                    print("episode: {}/{}, score: {}, e: {:.2}, time: {}".format(episode+1, self.episodes, i,self.epsilon, timestamp))
                self.replay()
        return scores

        

In [None]:
# Hyper-parameters consumed by rl_agent.__init__ (passed as one plain dict).
kwargs = dict(
    episodes=200,            # number of episodes played by training()
    gamma=0.95,              # discount rate (changed)
    epsilon_start=1.0,       # initial exploration rate
    epsilon_min=0.001,       # exploration floor (changed from 0.01)
    epsilon_decay=0.999,     # multiplicative decay; an earlier note mentions 0.995 - 0.999 is the value in effect
    batch_size=64,           # transitions sampled per replay step
    train_start=1000,        # stored transitions required before learning begins
    layers=[4, 512, 256, 64, 2],  # layer widths handed to torch_model, input first, output last
    dropout=0.0,             # drop probability for every hidden layer
    batch_norm=True,         # BatchNorm1d after each hidden activation (an earlier note says this was toggled - TODO confirm intended value)
    activation='relu',       # hidden activation name
    lr=0.00025,              # Adam learning rate (changed from 0.001)
    memory_length=2000,      # requested replay-buffer capacity
)

In [None]:
# Entry point: build one agent from the `kwargs` configuration and train it.
# `agent` is left bound at module level so later notebook cells can inspect it.
if __name__ == "__main__":
    # --- Disabled experiment (kept for reference) ---------------------------
    # The commented-out lines below repeat training over several runs and plot
    # the mean +/- 1 std of the per-episode reward. They assume training()
    # returns a per-episode reward sequence.
    # NOTE(review): the numbers are inconsistent as written - `runs = 20` vs.
    # the "100 runs" title, and an x-axis of 150 episodes vs. the configured
    # episode count - reconcile before re-enabling.
    # rewards_final = []
    # runs = 20
    # for run in range(runs):
    agent = rl_agent(kwargs)
    agent.training()
        # rewards_final.append(agent.training())
    # rewards_final  =np.array(rewards_final)
    # rewards_avg = np.average(rewards_final,axis = 0)
    # rewards_std = np.std(rewards_final,axis = 0)
    # plt.plot(list(range(1,151)),rewards_avg,color = 'red',label='average')
    # plt.fill_between(list(range(1,151)),rewards_avg-rewards_std,rewards_avg+rewards_std,color='red',label='+/- 1 std',alpha=0.2)
    # plt.xlabel('episode #')
    # plt.ylabel('reward')
    # plt.title('mean and std over 100 runs of reward')
    # plt.legend()
    # plt.show()

episode: 1/200, score: 39, e: 1.0, time: 00:52:17
episode: 2/200, score: 12, e: 1.0, time: 00:52:17
episode: 3/200, score: 51, e: 1.0, time: 00:52:17
episode: 4/200, score: 28, e: 1.0, time: 00:52:17
episode: 5/200, score: 20, e: 1.0, time: 00:52:17
episode: 6/200, score: 12, e: 1.0, time: 00:52:17
episode: 7/200, score: 30, e: 1.0, time: 00:52:17
episode: 8/200, score: 22, e: 1.0, time: 00:52:17
episode: 9/200, score: 46, e: 1.0, time: 00:52:17
episode: 10/200, score: 28, e: 1.0, time: 00:52:17
episode: 11/200, score: 51, e: 1.0, time: 00:52:17
episode: 12/200, score: 16, e: 1.0, time: 00:52:17
episode: 13/200, score: 12, e: 1.0, time: 00:52:17
episode: 14/200, score: 10, e: 1.0, time: 00:52:17
episode: 15/200, score: 24, e: 1.0, time: 00:52:17
episode: 16/200, score: 26, e: 1.0, time: 00:52:17
episode: 17/200, score: 19, e: 1.0, time: 00:52:17
episode: 18/200, score: 15, e: 1.0, time: 00:52:17
episode: 19/200, score: 17, e: 1.0, time: 00:52:17
episode: 20/200, score: 13, e: 1.0, time

KeyboardInterrupt: ignored