# Set numerical variables

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import time

import environment as env
import briscola_players as players
import simulate_games as sim_games
import module_next_states as next_states

import importlib

# Number of playable rounds kept from every recorded game.
number_of_rounds = 20

# Suits and ranks of the deck; my_points[k] is the point value of rank numeri[k].
semi = ['bastoni', 'coppe', 'denari', 'spade']
numeri = ['2', '3', '4', '5', '6', '7', '8', '9', '0', '1']
my_points = [0, 10, 0, 0, 0, 0, 2, 3, 4, 11]

# One row per rank, one column of ones per suit, plus the point value of the rank.
my_cards = pd.DataFrame({seme: [1] * len(numeri) for seme in semi}, index = numeri)
my_cards['points'] = my_points
sorted_cards = my_cards.sort_values(by = ['points'], ascending = False)
sorted_cards

# Every card written as rank + suit, e.g. '2bastoni'.
cards_in_string = [numero + seme for numero in numeri for seme in semi]

# Same 40 cards followed by the placeholder 'None'.
all_cards_in_strings = [numero + seme for numero in numeri for seme in semi] + ['None']
sorted_cards_in_string = sorted(all_cards_in_strings)
all_cards_in_strings_set = set(all_cards_in_strings)


from sklearn.preprocessing import MultiLabelBinarizer
# Binarizer whose class order is fixed to the sorted card strings (the 40 cards plus 'None').
# NOTE(review): not used in the visible part of this file - presumably shared with the encoding helpers; confirm.
mlb = MultiLabelBinarizer(classes = sorted_cards_in_string)



## Training functions

In [3]:
# Loss minimised by training(): binary cross-entropy between target and predicted Q-values.
loss_fn = tf.keras.losses.BinaryCrossentropy()
# Optimizer shared by every call to training(); clipnorm limits the norm of each gradient to 1.
optimizer2 = tf.keras.optimizers.Adam(clipnorm = 1.)

def training(model, batch_of_games, target_q_val, past_loss = None):
    '''
    Run one gradient step of `model` on a batch of recorded games.

    batch_of_games: 3-D array (batch_size, rounds, #features for a game); only the
        first `number_of_rounds` rounds of every game are used.
    target_q_val: target Q-values, assumed to be of shape (batch_size, rounds) as
        produced by get_target_qvals - TODO confirm.
    past_loss: optional list; when given, the loss of this step is appended to it.

    Returns `past_loss` (with the new loss appended) when it was given, otherwise None.
    '''
    batch_of_games = batch_of_games[:, :number_of_rounds, :]
    # One-hot selector of the card that was actually played in every round.
    mask = create_mask(batch_of_games)

    n_games, n_rounds, n_features = batch_of_games.shape
    flat_games = np.reshape(batch_of_games, (n_games * n_rounds, n_features))

    # The raw rows still contain ['pl 2 hand 1', 'pl 2 hand 2', 'pl 2 hand 3', 'played card'];
    # before_encoding_a_game removes those columns before the encoding step.
    encode_games = next_states.encode_a_game(next_states.before_encoding_a_game(flat_games))
    encode_games = np.reshape(encode_games, (n_games, n_rounds, encode_games.shape[1]))

    with tf.GradientTape() as tape:
        _, all_Q_values = model(encode_games, my_return_sequences = True)  # (batch_size, rounds, 3)
        # Zero out the Q-values of the cards that were not played ...
        Q_values = all_Q_values * mask
        # ... so that the max over the last axis picks the Q-value of the played card.
        final_q_values = tf.reduce_max(Q_values, axis = -1)  # (batch_size, rounds)
        # Transposing is only useful if per-round weights are passed through sample_weight.
        loss = loss_fn(tf.transpose(target_q_val), tf.transpose(final_q_values))

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer2.apply_gradients(zip(grads, model.trainable_variables))

    # Identity test: `!= None` would be evaluated element-wise on array-like arguments.
    if past_loss is not None:
        past_loss.append(loss)
        return past_loss
    return None
    
    
def create_mask(batch_of_games):
    '''
    Build the one-hot mask of the played cards, shape (batch_size, rounds, 3).

    Only the first `number_of_rounds` rounds of every game are considered.
    Example: an entry [0,0,:] equal to [0.,1.,0.] means that in game 0, at hand 0,
    pl1 played the card held in the second position of the hand.
    '''
    games = batch_of_games[:, :number_of_rounds, :]
    n_games, n_rounds, n_features = games.shape
    one_hot_rows = np.eye(3)

    def one_hot_of_played_card(row):
        # Entries 1..3 are read as the hand and entry 10 as the played card;
        # the first hand slot equal to the played card is the chosen action.
        slot = np.where(row[1:4] == row[10])[0][0]
        return one_hot_rows[slot]

    flat_rows = np.reshape(games, (n_games * n_rounds, n_features))
    flat_mask = np.apply_along_axis(one_hot_of_played_card, -1, flat_rows)
    return np.reshape(flat_mask, (n_games, n_rounds, 3))
    
    
def select_batch_of_games(all_games, batch_size):
    '''
    Draw a random batch of whole games from the flat table of recorded rounds.

    all_games is read as consecutive groups of number_of_rounds + 1 rows, one group
    per game. The returned array has shape (games, number_of_rounds + 1, #features):
    the trailing round of every game is still present here and is sliced away by
    the callers.

    When batch_size exceeds the number of stored games, all games are returned in
    their original order; otherwise batch_size distinct games are sampled.
    '''
    rows_per_game = number_of_rounds + 1
    number_of_games = all_games.shape[0] // rows_per_game
    games = np.reshape(all_games, (number_of_games, rows_per_game, all_games.shape[1]))

    if batch_size > number_of_games:
        return games

    chosen = np.random.choice(number_of_games, batch_size, replace = False)
    return games[chosen, :, :]
    
    
def get_previous_outputs(model, batch_of_games):
    '''
    Run `model` over the encoded games and return its first output.

    batch_of_games is a 3-D array (batch_size, rounds, #features for a game); only
    the first `number_of_rounds` rounds of every game are fed to the model.

    The returned value is iterated by get_target_qvals as a collection of
    recurrent outputs - presumably one (batch_size, rounds, units) tensor per
    recurrent layer; TODO confirm against the model definition.
    '''
    games = batch_of_games[:, :number_of_rounds, :]
    n_games, n_rounds, n_features = games.shape

    # Flatten to one row per round: the encoder works on 2-D input.
    flat_rounds = np.reshape(games, (n_games * n_rounds, n_features))

    # The raw rows still carry ['pl 2 hand 1', 'pl 2 hand 2', 'pl 2 hand 3', 'played card'];
    # before_encoding_a_game strips those columns before the encoding step.
    stripped_rounds = next_states.before_encoding_a_game(flat_rounds)
    encoded_rounds = next_states.encode_a_game(stripped_rounds)
    encoded_games = np.reshape(encoded_rounds, (n_games, n_rounds, encoded_rounds.shape[1]))

    recurrent_outputs, _ = model(encoded_games, my_return_sequences = True)
    return recurrent_outputs


def get_target_qvals(model, batch_of_games, previous_outputs):
    '''
    Estimate the target Q-value of every round from the state that follows it.

    batch_of_games: array (batch_size, rounds, #features for a game). It is passed
        unsliced to next_states.get_new_single_state_from_a_batch_of_games, which
        builds the follow-up states.
    previous_outputs: collection of recurrent outputs as returned by
        get_previous_outputs; each one is reshaped as a 3-D
        (batch_size, rounds, units) tensor and used as initial state of the model.

    returns result of shape (batch_size, 20)
    '''
    # assumed (batch_size, rounds, 1, #features for next states) - TODO confirm
    next_state = next_states.get_new_single_state_from_a_batch_of_games(batch_of_games)
    n_games, n_rounds, n_samples, n_features = next_state.shape

    # Encode the follow-up states, one row per (game, round, sampled state).
    flat_next_states = np.reshape(next_state, (n_games * n_rounds * n_samples, n_features))
    encode_games = next_states.encode_a_game(flat_next_states)

    # Turn every recurrent output into one initial state per (game, round) row.
    # The intermediate singleton axis is a placeholder for averaging over several
    # sampled follow-up states instead of a single one; with one state it has no effect.
    initial_states = []
    for output in previous_outputs:
        with_sample_axis = tf.reshape(output, (output.shape[0], output.shape[1], 1, output.shape[2]))
        trimmed = with_sample_axis[:, :number_of_rounds, :, :]
        initial_states.append(
            tf.reshape(trimmed, (previous_outputs[0].shape[0] * number_of_rounds, previous_outputs[0].shape[2]))
        )

    # Every follow-up state is fed as a sequence of length one.
    step_inputs = tf.reshape(encode_games, (encode_games.shape[0], 1, encode_games.shape[1]))
    _, q_vals = model(step_inputs, initial_states = initial_states)

    all_q_vals = tf.reshape(q_vals, (n_games, n_rounds, n_samples, 3))

    # Best action value of every follow-up state: (batch_size, rounds, sampled states).
    max_q_vals = tf.math.reduce_max(all_q_vals, axis = -1)

    # Average over the sampled follow-up states: (batch_size, rounds).
    target_q_vals = tf.math.reduce_mean(max_q_vals, axis = -1)
    return target_q_vals

In [4]:
#Cell to save the model with the custom final layer
class MyModel_save(tf.keras.Model):
    '''
    Four stacked GRU layers (200 units each) followed by a 3-unit sigmoid layer.

    Used by my_save_weights as the container into which trained weights are
    copied before saving. Only the output for the last time step is returned.
    '''

    def __init__(self):
        super().__init__()
        self.gru1 = tf.keras.layers.GRU(200, return_sequences = True)
        self.gru2 = tf.keras.layers.GRU(200, return_sequences = True)
        self.gru3 = tf.keras.layers.GRU(200, return_sequences = True)
        self.gru4 = tf.keras.layers.GRU(200, return_sequences = True)
        self.dense = tf.keras.layers.Dense(3, activation = "sigmoid")

    def call(self, state):
        # Note: the initial state of a GRU layer has shape (batch_size, 200),
        # which the TF documentation does not spell out.
        hidden = state
        for recurrent_layer in (self.gru1, self.gru2, self.gru3, self.gru4):
            hidden = recurrent_layer(hidden)
        # Only the last time step feeds the final layer.
        return self.dense(hidden[:, -1, :])

def my_save_weights(trained_model, location = 'location'):
    '''
    Copy the weights of `trained_model` into a fresh MyModel_save and save it.

    trained_model: model whose get_weights() output is compatible with MyModel_save.
    location: path handed to the Keras save call (saved with save_format="tf").
    '''
    saving_model = MyModel_save()
    saving_model.compile()

    # A single call on a dummy (1, 1, 250) input makes Keras create the weights,
    # so that set_weights has something to overwrite.
    dummy_input = np.linspace(1, 100, 250).reshape(1, 1, 250)
    saving_model(dummy_input)

    saving_model.set_weights(trained_model.get_weights())
    saving_model.save(location, save_format="tf")



def my_load_weights(to_be_trained_model, location = 'location'):
    '''
    Load the model saved at `location` and copy its weights into `to_be_trained_model`.

    Returns `to_be_trained_model`, which is modified in place.
    '''
    # A single call on a dummy (1, 1, 250) input makes Keras create the weights
    # of the receiving model before they are overwritten.
    dummy_input = np.linspace(1, 100, 250).reshape(1, 1, 250)
    to_be_trained_model(dummy_input)

    stored_model = tf.keras.models.load_model(location)
    to_be_trained_model.set_weights(stored_model.get_weights())
    return to_be_trained_model


## Main

Sample training loop. See README for the details on how my model was trained.

In [None]:
###Main, loss without sample weights
import matplotlib.pyplot as plt

# Player classes used to generate the training games: the network plays as player 1.
player_1 = players.DeepPlayer
player_2 = players.DeterministicPlayer

# my_model is the network handed to the simulator to play games.
my_model = players.MyModel()
my_model.compile()

# model_copy is the network that is actually trained; its weights are copied into my_model.
model_copy = players.MyModel()
model_copy.compile()

# Game environment shared by every simulation.
partita = env.Briscola_env()
#print(sim_games.simulate_games(200, player_1, player_2, partita))

# Initial buffer of recorded games: 100 games played with my_model plus 50 played with model_copy.
my_games = sim_games.simulate_games_and_record_data(100, player_1, player_2, partita, epsilon = .2, pl_1_model = my_model)
next_50_games = sim_games.simulate_games_and_record_data(50, player_1, player_2, partita, epsilon = .2, pl_1_model = model_copy)
my_games = np.concatenate([my_games,next_50_games], axis = 0)

# Start both networks from the same weights.
my_model.set_weights(model_copy.get_weights())

#with these parameters the expected number of times a game is selected is 6. In general it is (b size/number of new games at each episode)
# NOTE(review): the figure 6 does not obviously follow from batch_size = 500 and 50 new games - confirm.

# A model is saved only when its fraction of games lost vs the random player drops below this value.
top_score = 0.132

# Histories used for the plots.
past_loss = []
fractions_games_lost_vs_random = []
fractions_games_lost_vs_det = []
# Number of games drawn from the buffer for every training step.
batch_size = 500


for episode in range(5000):
    # Generate fresh games with the playing network and add them to the buffer:
    # every episode during the first half of the run, afterwards only every 10th
    # episode outside the first 100 episodes that follow an evaluation.
    # (Previously the games simulated while episode < 2500 were never added to
    # my_games, so they were generated and then thrown away.)
    if episode < 2500:
        next_50_games = sim_games.simulate_games_and_record_data(50, player_1, player_2, partita, epsilon = .2, pl_1_model = my_model)
        my_games = np.concatenate([my_games,next_50_games], axis = 0)
    elif episode%10 == 0 and episode%500 > 100:
        # The lower epsilon for episode >= 7500 only matters if the loop is extended.
        next_50_games = sim_games.simulate_games_and_record_data(50, player_1, player_2, partita, epsilon = .15 if episode < 7500 else .1, pl_1_model = my_model)
        my_games = np.concatenate([my_games,next_50_games], axis = 0)

    # Keep only the most recent 2,100,000 rows (100,000 games of 21 rows each).
    if my_games.shape[0] > 2100000:
        my_games = my_games[-2100000:,:]

    batch = select_batch_of_games(my_games, batch_size)

    # One training step of model_copy: recurrent outputs, targets, gradient update.
    prev_outputs = get_previous_outputs(model_copy, batch)

    tar_q_val = get_target_qvals(model_copy, batch, prev_outputs)
    past_loss = training(model_copy, batch, tar_q_val, past_loss)

    ##metrics
    # Rolling mean (window of 100 steps) of the training loss.
    if episode%50 == 0 and episode > 1:
        a = pd.DataFrame(past_loss, columns = ['loss']).rolling(window = 100).mean()
        my_plot = a.plot(figsize = (18,12))
        plt.show()

    # Every 500 episodes: refresh the playing network and evaluate it.
    if episode%500 == 0:
        my_model.set_weights(model_copy.get_weights())
        print('-----------------------------------------------')
        print('Episode:', episode)
        print('-----------------------------------------------')
        player1 = players.DeepPlayer
        randompl = players.RandomPlayer
        det_pl = players.DeterministicPlayer

        lost_vs_random = sim_games.simulate_games(500, player1, randompl, partita, pl_1_model = my_model)
        fractions_games_lost_vs_random.append(lost_vs_random)

        # The evaluation games against the deterministic player are recorded and
        # also added to the training buffer.
        lost_vs_det_np = sim_games.simulate_games_and_record_data(500, player1, det_pl, partita, pl_1_model = my_model)
        my_games = np.concatenate([my_games,lost_vs_det_np], axis = 0)
        lost_vs_det = sim_games.how_many_lost_games(lost_vs_det_np)

        fractions_games_lost_vs_det.append(lost_vs_det)

        #Save the model if we outperform the previous model
        if lost_vs_random < top_score:
            top_score = lost_vs_random
            print('-----------------------------------------------')
            print("New top score:", lost_vs_random)
            print('-----------------------------------------------')
            my_save_weights(my_model, 'model_top_accuracy')

        print('-----------------------------------------------')
        print('Last result vs random:', lost_vs_random)
        print('Last result vs det:', lost_vs_det)
        print('-----------------------------------------------')
        b = pd.DataFrame([fractions_games_lost_vs_random, fractions_games_lost_vs_det], index = ['lost vs random', 'lost vs det' ]).transpose()
        my_second_plot = b.plot(figsize = (18,12))
        my_second_plot.axhline(0.3)
        my_second_plot.axhline(0.2)
        my_second_plot.axhline(0.1)
        plt.show()

        # Smoothed evaluation curves; never reached with range(5000), kept for longer runs.
        if episode > 5000:
            c1 = pd.DataFrame([fractions_games_lost_vs_random, fractions_games_lost_vs_det], index = ['lost vs random', 'lost vs det' ]).transpose()
            c = c1.rolling(window = 10).mean()
            my_third_plot = c.plot(figsize = (18,12))
            my_third_plot.axhline(0.3)
            my_third_plot.axhline(0.2)
            my_third_plot.axhline(0.1)
            plt.show()