In [1]:
# %pip install tensorflow

In [101]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
# from tensorflow.keras import callbacks
 

import copy
import random
from json.encoder import INFINITY
import pandas as pd
import math
from ayo_game import play, is_illegal_move, assign_reward, print_game_play, end_game
from agents import random_agent as ra
from agents import minimax_agent as ma
from agents import mcts_agent as mctsa
seed = 37

randint = random.randint
random.seed(seed)

In [3]:
random_agent = ra.agent
minimax_agent = ma.agent
minimax_value = ma.get_action_value

In [90]:
def custom_train_test_split(data, test_size=0.2, shuffle=True, random_state=None):
    """
    Custom function to split data into training and test sets.

    Parameters:
    - X: Input features (numpy array or list)
    - y: Target labels (numpy array or list)
    - test_size: Proportion of the data to include in the test split (default 0.2)
    - shuffle: Whether to shuffle the data before splitting (default True)
    - random_state: Seed for the random number generator (optional, for reproducibility)

    Returns:
    - X_train, X_test, y_train, y_test: Split training and testing data
    """
    X = np.array([d[0] for d in data])
    y = np.array([d[1] for d in data])

    # Set random seed if provided (to ensure reproducibility)
    if random_state is not None:
        np.random.seed(random_state)

    # Get the number of samples
    num_samples = len(X)

    # Shuffle the data if requested
    if shuffle:
        indices = np.random.permutation(num_samples)
        X = X[indices]
        y = y[indices]

    # Compute the split index
    split_index = int(num_samples * (1 - test_size))

    # Split the data into training and testing sets
    X_train, X_test = X[:split_index], X[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]

    return X_train, X_test, y_train, y_test


In [211]:
class PolicyModel:
    def __init__(self):
        self.num_actions=12
        self.X_shape=14
        self.model = Sequential([
            Dense(64, input_dim=self.X_shape, activation='relu'),  # First hidden layer
            Dense(32, activation='relu'),  # Second hidden layer
            Dense(self.num_actions, activation='softmax')  # Output layer for 12 possible actions
        ])

        # Compile the model
        self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'], )

    def train(self, training_examples):
                # Train the model (Assume more training data is available)
        # For example purposes, we'll use the same sample data multiple times
        X_train, X_test, y_train, y_test = custom_train_test_split(training_examples, test_size=0.3, shuffle=True, random_state=None)

        # Convert target (y) to categorical (for classification)
        num_actions = self.num_actions # Assuming 12 possible actions (0 to 11)
        y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_actions)
        y_test = tf.keras.utils.to_categorical(y_test, num_classes=num_actions)

        callbacks = [EarlyStopping(patience=20, monitor='loss', verbose=0),
             ReduceLROnPlateau(monitor='val_accuracy',factor=0.01, min_Ir=0.00001, verbose=0),
             ModelCheckpoint('latest_policy_model.keras', verbose=0, save_best_only=True, save_weights_only=False)]

        # Train the model (Assume more training data is available)
        # For example purposes, we'll use the same sample data multiple times
        self.model.fit(X_train, y_train, epochs=200, batch_size=64,  callbacks=callbacks, validation_data=(X_test,y_test))


    def predict(self,data):
        prediction = self.model.predict(data)
        return prediction
    

class ValueModel:
    def __init__(self):
        self.X_shape= 14
        self.model = Sequential([
            Dense(64, input_dim=self.X_shape, activation='relu'),  # First hidden layer
            Dense(32, activation='relu'),  # Second hidden layer
            Dense(1)  # Output layer for 12 possible actions
        ])

        # Compile the model
        self.model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

    def train(self, training_examples):
        # Train the model (Assume more training data is available)
        # For example purposes, we'll use the same sample data multiple times
        X_train, X_test, y_train, y_test = custom_train_test_split(training_examples, test_size=0.3, shuffle=True, random_state=None)

        callbacks = [EarlyStopping(patience=20, monitor='loss', verbose=0),
             ReduceLROnPlateau(monitor='val_accuracy',factor=0.01, min_Ir=0.00001, verbose=0),
             ModelCheckpoint('latest_value_model.keras', verbose=0, save_best_only=True, save_weights_only=False)]

        self.model.fit(X_train, y_train, epochs=200, batch_size=64,  callbacks=callbacks, validation_data=(X_test,y_test))


    def predict(self,data):
        # Predict action for a new data point
        data = np.array(data)
        prediction = self.model.predict(data)
        return prediction


In [212]:
policy_model = PolicyModel()
value_model = ValueModel()

In [213]:
def format_state(state):
    x = []
    x.extend(state['board'])
    x.append(state['current_player'])
    x.append(state['player_territory'][1])

    return x

In [214]:

def remove_duplicates(arr):
    # Convert the list of lists to a NumPy array
    np_arr = np.array(arr, dtype=object)
    
    # Create a set to keep track of unique first elements (list of numbers)
    seen = set()
    unique_arr = []

    for item in np_arr:
        # Convert the list of numbers (first element) to a tuple so it can be added to a set
        num_tuple = tuple(item[0])
        if num_tuple not in seen:
            unique_arr.append(item)
            seen.add(num_tuple)
    
    return np.array(unique_arr, dtype=object)

In [215]:
def generate_action(state,agent_1, agent_2):
   if state['current_player'] == 0:
      func = agent_1['func']
      arg = agent_1['arg']
      return func(state, arg)

   if state['current_player'] == 1:
      func = agent_2['func']
      arg = agent_2['arg']
      return func(state, arg)

In [216]:
def execute_episode(state,  agent_1, agent_2,show=False):
    state = copy.deepcopy(state)
    reward = [0,0,0]
    path = []
    train_example_policy = []
    train_example_value = []

    while True:
        action = generate_action(state, agent_1, agent_2)
        
        if is_illegal_move(state, action):
            continue

        _, value = minimax_value(state, agent_2['arg'])
        train_example_value.append([format_state(state), value])
        train_example_policy.append([format_state(state), action])
        state, new_reward = play(state, action)
        reward = assign_reward(reward, new_reward)
        if show:
            print_game_play(state, reward, action)

        if end_game(state):
            break
        path.append(action)

    return  train_example_value, train_example_policy

In [217]:
def generate_training_data():
    num_of_eps = 100
    value_data = []
    policy_data = []
    state = {
   # 'board' :[6, 6, 2, 7, 1, 6, 1, 6, 6, 6, 0, 1],
   'board' :[4,4,4,4,4,4,4,4,4,4,4,4],
   'current_player': 0,
   'player_territory': (0,6)
    }
    
    agent_1 = {
    'func': random_agent,
    'arg': {},
    'name': 'random_agent',
    'elo': 1200
    }
        
    agent_2 = {
    'func': minimax_agent,
    'arg': {
        'max_dept': 9,
    },
    'name': 'minimax_agent_dept_3',
    'elo': 1200
    }

    for i in range(num_of_eps):
        state['current_player'] = i%2
        state_value_data, state_policy_data = execute_episode(state,  agent_1, agent_2)
        print('state_value_data: ', state_value_data)
        print('state_policy_data: ', state_policy_data)
        value_data.extend(state_value_data)
        policy_data.extend(state_policy_data)
        print((i/num_of_eps) * 100,'%')
        
    return remove_duplicates(value_data), remove_duplicates(policy_data)


In [None]:
value_data, policy_data = generate_training_data()
len(value_data)

state_value_data:  [[[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 6], 0], [[1, 6, 6, 2, 7, 1, 6, 1, 6, 6, 6, 0, 1, 6], -8], [[0, 8, 8, 0, 0, 2, 7, 2, 7, 7, 1, 2, 0, 6], 0], [[1, 8, 8, 0, 0, 0, 8, 0, 8, 8, 0, 3, 1, 6], -4], [[2, 9, 9, 1, 1, 0, 8, 0, 0, 9, 1, 0, 0, 6], 0], [[3, 10, 10, 1, 0, 1, 0, 1, 1, 10, 2, 1, 1, 6], -8], [[0, 10, 10, 1, 0, 1, 0, 1, 1, 10, 0, 2, 0, 6], 0], [[2, 2, 1, 3, 2, 3, 2, 3, 3, 12, 2, 1, 1, 6], -4], [[0, 3, 2, 0, 2, 3, 2, 3, 3, 12, 0, 2, 0, 6], 0], [[0, 3, 2, 0, 2, 0, 3, 0, 0, 12, 0, 2, 1, 6], -8], [[0, 1, 0, 1, 3, 1, 1, 2, 2, 1, 0, 0, 0, 6], -4], [[0, 1, 0, 1, 3, 0, 0, 3, 0, 2, 1, 1, 1, 6], -4], [[1, 1, 0, 1, 3, 0, 0, 0, 1, 3, 0, 2, 0, 6], -4], [[1, 1, 0, 1, 0, 1, 1, 1, 1, 3, 0, 2, 1, 6], -4], [[1, 1, 0, 1, 0, 1, 0, 0, 2, 0, 0, 2, 0, 6], 0], [[1, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 2, 1, 6], 0], [[1, 1, 0, 1, 0, 0, 0, 1, 2, 0, 0, 2, 0, 6], 0], [[0, 0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 2, 1, 6], 0], [[1, 1, 0, 1, 0, 2, 1, 0, 0, 1, 1, 0, 0, 6], 0], [[1, 1, 0, 0, 1, 2, 1, 0, 0, 1,

In [194]:
value_model.train(data)

Epoch 1/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.2765 - loss: 34.7690 - val_accuracy: 0.2227 - val_loss: 703.0892 - learning_rate: 0.0010
Epoch 2/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2632 - loss: 28.0122 - val_accuracy: 0.2227 - val_loss: 700.8851 - learning_rate: 0.0010
Epoch 3/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2591 - loss: 26.5041 - val_accuracy: 0.2227 - val_loss: 698.2056 - learning_rate: 0.0010
Epoch 4/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2856 - loss: 24.0992 - val_accuracy: 0.2227 - val_loss: 695.6357 - learning_rate: 0.0010
Epoch 5/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3035 - loss: 20.4567 - val_accuracy: 0.2185 - val_loss: 693.7120 - learning_rate: 0.0010
Epoch 6/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [195]:
# play games
# store outcome of games
# train model
data = [
    [6, 2, 7, 1, 6, 1, 6, 6, 6, 0, 1, 6, 1, 6],# -8
    [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 6],# 0
    [3, 1, 3, 0, 2, 1, 0, 2, 2, 0, 2, 0, 0, 6],# 0
    [0, 0, 0, 0, 1, 1, 0, 1, 3, 1, 0, 1, 1, 6],# -8
    [2, 0, 3, 1, 3, 1, 2, 2, 1, 2, 3, 12, 1, 6],# -8
]
# data = [[0, 0, 3, 11, 1, 1, 0, 1, 2, 11, 1, 1, 1, 6]]
value_model.predict(data)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step


array([[-1.4147128 ],
       [-0.95667267],
       [ 2.140177  ],
       [-5.8980227 ],
       [-2.7277    ]], dtype=float32)

In [None]:
checkpoint_filepath = 'checkpoint.model.keras'
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# Model is saved at the end of every epoch, if it's the best seen so far.
model.fit(epochs=EPOCHS, callbacks=[model_checkpoint_callback])

# The model (that are considered the best) can be loaded as -
keras.models.load_model(checkpoint_filepath)

# Alternatively, one could checkpoint just the model weights as -
checkpoint_filepath = '/tmp/ckpt/checkpoint.weights.h5'
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# Model weights are saved at the end of every epoch, if it's the best seen
# so far.
model.fit(epochs=EPOCHS, callbacks=[model_checkpoint_callback])

# The model weights (that are considered the best) can be loaded as -
model.load_weights(checkpoint_filepath)