# RL and Advanced DL: Домашнее задание 2



In [1]:
%autosave 60

Autosaving every 60 seconds


In [249]:
import gym
import sys
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import json
import math
import random
import numpy as np
import scipy as sp
import scipy.stats as st
import scipy.integrate as integrate
from scipy.stats import multivariate_normal
from sklearn import linear_model
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
import statsmodels.api as sm
from matplotlib.colors import LogNorm
import pickle
import copy

from joblib import Parallel, delayed
import multiprocessing
from collections import namedtuple,deque
from itertools import count
from itertools import product
import cProfile
from datetime import datetime
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.autograd import Variable

sns.set_style("whitegrid")
sns.set_palette("colorblind")
palette = sns.color_palette()
figsize = (15,8)
legend_fontsize = 16

from matplotlib import rc
rc('font',**{'family':'sans-serif'})
rc('text', usetex=True)
# rc('text.latex',preamble=r'\usepackage[utf8]{inputenc}')
# rc('text.latex',preamble=r'\usepackage[russian]{babel}')
rc('figure', **{'dpi': 300})

## Классы для крестиков ноликов

In [3]:
N_ROWS, N_COLS, N_WIN = 3, 3, 3

In [4]:
class TicTacToe(gym.Env):
    def __init__(self, n_rows=N_ROWS, n_cols=N_COLS, n_win=N_WIN, clone=None):
        if clone is not None:
            self.n_rows, self.n_cols, self.n_win = clone.n_rows, clone.n_cols, clone.n_win
            self.board = copy.deepcopy(clone.board)
            self.curTurn = clone.curTurn
            self.emptySpaces = None
            self.boardHash = None
        else:
            self.n_rows = n_rows
            self.n_cols = n_cols
            self.n_win = n_win

            self.reset()

    def getEmptySpaces(self):
        if self.emptySpaces is None:
            res = np.where(self.board == 0)
            self.emptySpaces = np.array([ (i, j) for i,j in zip(res[0], res[1]) ])
        return self.emptySpaces

    def makeMove(self, player, i, j):
        self.board[i, j] = player
        self.emptySpaces = None
        self.boardHash = None

    def getHash(self):
        if self.boardHash is None:
            self.boardHash = ''.join(['%s' % (x+1) for x in self.board.reshape(self.n_rows * self.n_cols)])
        return self.boardHash

    def isTerminal(self):
        # проверим, не закончилась ли игра
        cur_marks, cur_p = np.where(self.board == self.curTurn), self.curTurn
        for i,j in zip(cur_marks[0], cur_marks[1]):
            win = False
            if i <= self.n_rows - self.n_win:
                if np.all(self.board[i:i+self.n_win, j] == cur_p):
                    win = True
            if not win:
                if j <= self.n_cols - self.n_win:
                    if np.all(self.board[i,j:j+self.n_win] == cur_p):
                        win = True
            if not win:
                if i <= self.n_rows - self.n_win and j <= self.n_cols - self.n_win:
                    if np.all(np.array([ self.board[i+k,j+k] == cur_p for k in range(self.n_win) ])):
                        win = True
            if not win:
                if i <= self.n_rows - self.n_win and j >= self.n_win-1:
                    if np.all(np.array([ self.board[i+k,j-k] == cur_p for k in range(self.n_win) ])):
                        win = True
            if win:
                self.gameOver = True
                return self.curTurn

        if len(self.getEmptySpaces()) == 0:
            self.gameOver = True
            return 0

        self.gameOver = False
        return None

    def printBoard(self):
        for i in range(0, self.n_rows):
            print('----'*(self.n_cols)+'-')
            out = '| '
            for j in range(0, self.n_cols):
                if self.board[i, j] == 1:
                    token = 'x'
                if self.board[i, j] == -1:
                    token = 'o'
                if self.board[i, j] == 0:
                    token = ' '
                out += token + ' | '
            print(out)
        print('----'*(self.n_cols)+'-')

    def getState(self):
        return (self.getHash(), self.getEmptySpaces(), self.curTurn)

    def action_from_int(self, action_int):
        return ( int(action_int / self.n_cols), int(action_int % self.n_cols))

    def int_from_action(self, action):
        return action[0] * self.n_cols + action[1]
    
    def step(self, action):
        if self.board[action[0], action[1]] != 0:
            return self.getState(), -10, True, {}
        self.makeMove(self.curTurn, action[0], action[1])
        reward = self.isTerminal()
        self.curTurn = -self.curTurn
        return self.getState(), 0 if reward is None else reward, reward is not None, {}

    def reset(self):
        self.board = np.zeros((self.n_rows, self.n_cols), dtype=int)
        self.boardHash = None
        self.gameOver = False
        self.emptySpaces = None
        self.curTurn = 1

In [5]:
def plot_board(env, pi, showtext=True, verbose=True, fontq=20, fontx=60):
    '''Рисуем доску с оценками из стратегии pi'''
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    X, Y = np.meshgrid(np.arange(0, env.n_rows), np.arange(0, env.n_rows))
    Z = np.zeros((env.n_rows, env.n_cols)) + .01
    s, actions = env.getHash(), env.getEmptySpaces()
    if pi is not None and s in pi.Q:
        for i, a in enumerate(actions):
            Z[a[0], a[1]] = pi.Q[s][i]
    ax.set_xticks([])
    ax.set_yticks([])
    surf = ax.imshow(Z, cmap=plt.get_cmap('Accent', 10), vmin=-1, vmax=1)
    if showtext:
        for i,a in enumerate(actions):
            if pi is not None and s in pi.Q:
                ax.text( a[1] , a[0] , "%.3f" % pi.Q[s][i], fontsize=fontq, horizontalalignment='center', verticalalignment='center', color="w" )
    for i in range(env.n_rows):
        for j in range(env.n_cols):
            if env.board[i, j] == -1:
                ax.text(j, i, "O", fontsize=fontx, horizontalalignment='center', verticalalignment='center', color="w" )
            if env.board[i, j] == 1:
                ax.text(j, i, "X", fontsize=fontx, horizontalalignment='center', verticalalignment='center', color="w" )
#     cbar = plt.colorbar(surf, ticks=[0, 1])
    ax.grid(False)
    plt.show()

def get_and_print_move(env, pi, s, actions, random=False, verbose=True, fontq=20, fontx=60):
    '''Делаем ход, рисуем доску'''
    plot_board(env, pi, fontq=fontq, fontx=fontx)
    if verbose and (pi is not None):
        if s in pi.Q:
            for i,a in enumerate(actions):
                print(i, a, pi.Q[s][i])
        else:
            print("Стратегия не знает, что делать...")
    if random:
        return np.random.randint(len(actions))
    else:
        return pi.getActionGreedy(s, len(actions))


# Табличный Q-learning


## Идея: сделаем два агента, которые будут играть друг против друга по своим табличным функциям 


In [280]:
class SexyEnvironment():
    def __init__(self,base_env):
        
        self.env = base_env
        self.env.reset()
        
        self.action_space_dim = len(self.env.getState()[1])
        self.state_space_dim = 3** len(self.env.getState()[0])
        self.state_spaces = len(self.env.getState()[0])
        
        # self.state_to_int={v:k for k,v in enumerate(product('012', repeat=self.state_space_dim))}
        # self.state = self._get_state()
        
    def reset(self):
        self.env.reset()
        return tuple(map(int,self.env.getState()[0]))
    @property
    def state(self):
        return tuple(map(int,self.env.getState()[0]))
        
    def step(self,action):
        env_output = self.env.step(self.env.action_from_int(action))
        state= tuple(map(int,env_output[0][0]))
        reward = env_output[1]
        terminated = env_output[2]
        
        return state,reward,terminated
    def sample(self):
        return np.random.choice(list(range(self.action_space_dim)))

In [187]:
class TableQlearning():
    def __init__(self,action_space_dim,state_spaces,alpha=0.1,gamma=0.1):
        self.Q = np.random.uniform(size=(tuple([3 for _ in range(state_spaces)])+(action_space_dim,)))
        self.max_learning_iterations = 10_000
        self.alpha = alpha
        self.gamma = gamma
        
        self.epsilon = 0.5
        self.epsilon_discount_factor = 0.99
        self.epsilon_decrease_every = 10**4
        self.counter = 1
        self.Q_prev = None
        
        
        
    def get_greedy_action(self, s):
        
        if np.random.uniform() < 1-self.epsilon:
            return np.argmax(self.Q[s])
        else:
            return np.random.choice(list(range(len(self.Q[s]))))
        
    def step(self,s_prev,action,reward,s):
        if self.counter % self.epsilon_decrease_every  ==0:
            self.epsilon = self.epsilon_discount_factor*self.epsilon
       
        self.Q[s_prev][action] = self.Q[s_prev][action]+self.alpha*(reward+self.gamma*np.argmax(self.Q[s]) - self.Q[s_prev][action])
      
        self.counter+=1
    
    
    def get_optimal_action(self,s):
        return np.argmax(self.Q[s])
    
    
        
    
        
        
        
        

In [209]:
def Q_learn(agent,env,num_games=10):
    rewards=[]
    for game_num in range(num_games):
        s_prev = env.reset()
        
        terminate=False
    
        
        while not terminate:
            action = agent.get_greedy_action(s_prev)
            s,raw_reward,terminate = env.step(action)
            
            if raw_reward ==-1:
                reward=1
            else:
                reward = raw_reward
                
            agent.step(s_prev,action,reward,s)
            s_prev=s
            
            
        rewards.append(raw_reward)
            
       
        if game_num % 10000==0:
            
            print(f'Game #{game_num}\t Reward {np.mean(rewards[-100:]):.3f}\tEps{agent.epsilon}')
            
    return rewards
                
        

In [210]:
other_env = TicTacToe(n_rows=3, n_cols=3, n_win=3)

In [211]:
env=SexyEnvironment(other_env)

In [212]:
agent = TableQlearning(action_space_dim=env.action_space_dim,state_spaces=env.state_spaces)



In [213]:
rewards = Q_learn(agent,env,num_games=1000000)

Game #0	 Reward -10.000	Eps0.5
Game #10000	 Reward -8.060	Eps0.47549502494999996
Game #20000	 Reward -6.250	Eps0.45219103750440215
Game #30000	 Reward -5.730	Eps0.43002917732064416
Game #40000	 Reward -3.980	Eps0.4089534687986153
Game #50000	 Reward -4.480	Eps0.38502157290257755
Game #60000	 Reward -4.730	Eps0.3661516848271987
Game #70000	 Reward -4.400	Eps0.34820660902478673
Game #80000	 Reward -4.290	Eps0.33114102049199173
Game #90000	 Reward -3.670	Eps0.31491181560161613
Game #100000	 Reward -4.330	Eps0.2964832232007497
Game #110000	 Reward -3.720	Eps0.2819525952261938
Game #120000	 Reward -3.390	Eps0.2681341126035925
Game #130000	 Reward -3.340	Eps0.2524429443935348
Game #140000	 Reward -4.100	Eps0.2400707282857106
Game #150000	 Reward -2.600	Eps0.22830487387195728
Game #160000	 Reward -1.510	Eps0.21711566339590585
Game #170000	 Reward -2.810	Eps0.20441008721127468
Game #180000	 Reward -2.450	Eps0.19439195903711348
Game #190000	 Reward -2.330	Eps0.18486481882486328
Game #200000	 Re

In [208]:
rewards

In [217]:
pd.Series(rewards).plot()

<AxesSubplot:>

RuntimeError: Failed to process string with tex because latex could not be found

<Figure size 1800x1200 with 1 Axes>

In [218]:
s=env.reset()

In [219]:
env.env.printBoard()

-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------


In [227]:

s= env.reset()
terminate=False


In [234]:

    






print(s)        
action = agent.get_optimal_action(s)


s,raw_reward,terminate = env.step(action)

print(s,env.env.action_from_int(action),raw_reward)

env.env.printBoard()         

(1, 1, 2, 0, 0, 2, 0, 1, 2)
(1, 1, 2, 0, 0, 2, 0, 1, 2) (1, 2) -10
-------------
|   |   | x | 
-------------
| o | o | x | 
-------------
| o |   | x | 
-------------


In [1287]:
agent_cross.one_step(env)

AttributeError: 'TableQlearningAgent' object has no attribute 'one_step'

In [1038]:
env.env.printBoard()

-------------
|   |   | o | 
-------------
| x |   | x | 
-------------
| x | o |   | 
-------------


In [1039]:
agent_zeros.one_step(env)

(0, False)

In [1040]:
env.env.printBoard()

-------------
|   |   | o | 
-------------
| x | o | x | 
-------------
| x | o |   | 
-------------


# Часть 2. Нейронные сети

In [289]:
class DQN:
    def __init__(self, state_dim, action_dim):
        self.steps = 0 # Do not change
         
        self.replay_buffer=deque(maxlen=MEMORY_SIZE)

    
        
        self.policy_network = nn.Sequential( 
                                        nn.Linear(state_dim,64),
                                        nn.ReLU(),
                                        nn.Linear(64,64),
                                        nn.ReLU(),   
                                        nn.Linear(64,action_dim),
                                        ) 
        
        
        self.target_network=copy.deepcopy(self.policy_network)
        
        self.optimizer = torch.optim.Adam(self.policy_network.parameters(),lr=LEARNING_RATE)
#
        
        
    def consume_transition(self, transition):
        self.replay_buffer.append((torch.tensor(transition[0],dtype=torch.float32),
                                  torch.tensor([transition[1]]),
                                  torch.tensor(transition[2],dtype=torch.float32),
                                  torch.tensor([transition[3]]),
                                  torch.tensor([transition[4]]))
        )
       
    

    def sample_batch(self):        
        batch = random.sample(self.replay_buffer,BATCH_SIZE)
        
        return Transition(*zip(*batch))
        
    def train_step(self, batch):
        self.policy_network.train()
        self.target_network.eval()
        
        
        state_batch = torch.stack(batch.state)
        action_batch = torch.stack(batch.action)
        reward_batch = torch.stack(batch.reward)
        next_states = torch.stack(batch.next_state)
        dones = torch.stack(batch.done)*1

        
        q_targets_next = self.target_network(next_states).detach().max(1)[0].unsqueeze(1)
        q_targets = reward_batch + GAMMA * q_targets_next * (1 - dones)
        q_expected = self.policy_network(state_batch).gather(1, action_batch)
        
        loss = F.smooth_l1_loss(q_expected.float(), q_targets.float())
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_network.parameters():

            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
#
        

        
    def update_target_network(self):
        # Update weights of a target Q-network here. You may use copy.deepcopy to do this or 
        # assign a values of network parameters via PyTorch methods.
        for target_param, local_param in zip(self.target_network.parameters(), self.policy_network.parameters()):
            target_param.data.copy_(TAU*local_param.data + (1.0-TAU)*target_param.data)
        

    def act(self, state, target=False):
        # Compute an action. Do not forget to turn state to a Tensor and then turn an action to a numpy array.
        self.target_network.eval()
        self.policy_network.eval()
        
        action = torch.argmax(dqn.policy_network(torch.tensor(state,dtype=torch.float32))).item()
        return action

    def update(self, transition):
        
        self.consume_transition(transition)
        if self.steps % STEPS_PER_UPDATE == 0:
            batch = self.sample_batch()
            self.train_step(batch)
        if self.steps % STEPS_PER_TARGET_UPDATE == 0:
            self.update_target_network()
        self.steps += 1

    # def save(self,name=''):
    #     torch.save(self.policy_network.state_dict(), name+"agent.pkl")

In [302]:
def evaluate_policy(agent, episodes=5):
    
    
    returns = []
    total_reward=0
    for _ in range(episodes):
        done = False
        state = env.reset()
        
        
        while not done:
            state, reward, done = env.step(agent.act(state))
            total_reward += reward
        returns.append(total_reward)
    return returns

In [303]:
GAMMA = 0.99
INITIAL_STEPS = 1024
TRANSITIONS = 500000
STEPS_PER_UPDATE = 2
STEPS_PER_TARGET_UPDATE = STEPS_PER_UPDATE * 1
BATCH_SIZE = 64
LEARNING_RATE = 5e-4
MEMORY_SIZE=INITIAL_STEPS
TAU=1e-3
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward','done'))


In [304]:
dqn.policy_network(torch.tensor(state).float())

tensor([ -1.2345,  -9.3047, -10.1442,  -0.7584,  -2.6587,  -6.5407,  -6.5867,
         -4.4503,  -9.3494], grad_fn=<AddBackward0>)

In [305]:
env = SexyEnvironment(other_env)

In [306]:
env.sample()

1

In [312]:
dqn = DQN(state_dim=env.state_spaces, action_dim=env.action_space_dim)

state = env.reset()
EPS_DECAY=0.9
EPS_END=0.05
EPS_START=1
eps = EPS_START
max_reward = float('-inf')

for _ in range(INITIAL_STEPS):
    action = dqn.act(state)

    next_state, reward, done = env.step(action)
    dqn.consume_transition((state, action, next_state, reward, done))

    state = next_state if not done else env.reset()



for i in range(TRANSITIONS):
   
    if random.random() < eps:
        action = env.sample()
    else:
        action = dqn.act(state)

    eps = max(EPS_END, EPS_DECAY*eps)

    next_state, reward, done = env.step(action)
    dqn.update((state, action, next_state, reward, done))

    state = next_state if not done else env.reset()

    if i % 1000 == 0:
        rewards = evaluate_policy(dqn,10)
        
        print(f"Step: {i+1}, Reward mean: {np.mean(rewards)}, Reward std: {np.std(rewards)}")
        

Step: 1, Reward mean: -55.0, Reward std: 28.722813232690143
Step: 1001, Reward mean: -55.0, Reward std: 28.722813232690143
Step: 2001, Reward mean: -55.0, Reward std: 28.722813232690143
Step: 3001, Reward mean: -55.0, Reward std: 28.722813232690143
Step: 4001, Reward mean: -55.0, Reward std: 28.722813232690143
Step: 5001, Reward mean: -55.0, Reward std: 28.722813232690143
Step: 6001, Reward mean: -55.0, Reward std: 28.722813232690143
Step: 7001, Reward mean: 5.5, Reward std: 2.8722813232690143
Step: 8001, Reward mean: 5.5, Reward std: 2.8722813232690143
Step: 9001, Reward mean: -55.0, Reward std: 28.722813232690143
Step: 10001, Reward mean: -55.0, Reward std: 28.722813232690143
Step: 11001, Reward mean: 5.5, Reward std: 2.8722813232690143
Step: 12001, Reward mean: 5.5, Reward std: 2.8722813232690143
Step: 13001, Reward mean: 5.5, Reward std: 2.8722813232690143
Step: 14001, Reward mean: 5.5, Reward std: 2.8722813232690143
Step: 15001, Reward mean: 5.5, Reward std: 2.8722813232690143
Ste

KeyboardInterrupt: 

# TODO
state spaces - state dim