In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split


import pickle
import pandas as pd
import numpy as np
from IPython.display import clear_output
import time
import random
import matplotlib.pyplot as plt

# Import data and fix up for transformer

In [2]:
# Path to the pickled Connect 4 dataset (resolved relative to the working directory)
file_path = "Connect4Dataset_SmartRandom_carson.pkl"

# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source
with open(file_path, "rb") as pkl_file:
    dataset_progressive_skill = pickle.load(pkl_file)

# Each record is a 6-tuple; unpack it into one labelled dict per row
data = [
    {
        "Game ID": game_id,
        "Board State": board,
        "Move": move,
        "Player 1 Skill": player1_skill,
        "Player 2 Skill": player2_skill,
        "Player": player,
    }
    for (
        game_id,
        board,
        move,
        player1_skill,
        player2_skill,
        player,
    ) in dataset_progressive_skill
]

# One row per recorded position, board state included
connect4 = pd.DataFrame(data)

In [9]:
# Display the assembled DataFrame as this cell's output
connect4

Unnamed: 0,Game ID,Board State,Move,Player 1 Skill,Player 2 Skill,Player
0,0,"[[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, ...",3,1748,2477,plus
1,0,"[[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, ...",2,1748,2477,minus
2,0,"[[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, ...",2,1748,2477,plus
3,0,"[[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, ...",3,1748,2477,minus
4,0,"[[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, ...",2,1748,2477,plus
...,...,...,...,...,...,...
118689,3999,"[[[1, 0], [0, 1], [0, 0], [1, 0], [0, 1], [0, ...",5,1872,2744,minus
118690,3999,"[[[1, 0], [0, 1], [0, 0], [1, 0], [0, 1], [0, ...",5,1872,2744,plus
118691,3999,"[[[1, 0], [0, 1], [0, 0], [1, 0], [0, 1], [0, ...",5,1872,2744,minus
118692,3999,"[[[1, 0], [0, 1], [0, 0], [1, 0], [0, 1], [0, ...",2,1872,2744,plus


In [10]:
def standardize_board_state(board, player):
    """
    Return a copy of the board with the two cell encodings exchanged for "minus".

    Every cell of the board is a length-2 vector. When ``player == "minus"``,
    cells equal to [0, 1] become [1, 0] and cells equal to [1, 0] become
    [0, 1]; all other cells are left as they are. For any other ``player``
    value the board contents are returned unchanged.

    Args:
        board: Array-like of shape (rows, cols, 2).
        player: Label of the player for this position; only the string
            "minus" triggers the swap.

    Returns:
        A new NumPy array. The input is never modified and never returned
        as-is, so results do not share memory with the caller's board.
    """
    # Always copy: DataFrame.copy() does not duplicate objects stored inside
    # cells, so returning the input here would alias the source frame's boards.
    board = np.array(board)

    if player == "minus":
        # Build both masks before writing so the second swap cannot pick up
        # cells that the first one has just rewritten.
        mask_01 = (board[:, :, 0] == 0) & (board[:, :, 1] == 1)  # cells equal to [0, 1]
        mask_10 = (board[:, :, 0] == 1) & (board[:, :, 1] == 0)  # cells equal to [1, 0]

        board[mask_01] = [1, 0]
        board[mask_10] = [0, 1]

    return board


def _standardize_row(row):
    """Standardize one DataFrame row's board using that row's player label."""
    return standardize_board_state(row["Board State"], row["Player"])


# Work on a copy so `connect4` keeps its original "Board State" column;
# only boards whose player is "minus" are changed by the standardization.
connect4_standardized = connect4.copy()
connect4_standardized["Board State"] = connect4_standardized.apply(
    _standardize_row, axis=1
)

In [13]:
def flip_board(board):
    """Return the board mirrored left-to-right (order along axis 1 reversed)."""
    return np.fliplr(board)


def add_flipped_boards(df):
    """
    Augment the dataset with a horizontally mirrored copy of every position.

    Each input row is emitted unchanged, immediately followed by a mirrored
    row whose board is flipped left-to-right and whose move is reflected to
    the matching column. All mirrored rows of one game share a new Game ID;
    new IDs start right after the largest ID in ``df`` and are handed out in
    the (sorted) order in which ``groupby`` visits the games.

    Args:
        df: A Pandas DataFrame with columns "Game ID", "Board State", "Move",
             plus any further columns (e.g. "Player 1 Skill", "Player 2 Skill",
             "Player"), which are copied unchanged onto the mirrored rows.
    Returns:
        A new DataFrame with columns "Game ID", "Board State", "Move" followed
        by the remaining input columns, holding original and mirrored rows.
        An input without rows yields an empty frame with those same columns.
    """
    key_columns = ["Game ID", "Board State", "Move"]
    # Fix the output schema up front so it survives even when no rows are produced
    output_columns = key_columns + [c for c in df.columns if c not in key_columns]

    augmented_rows = []
    first_new_game_id = df["Game ID"].max() + 1  # new IDs start after the current max

    for offset, (game_id, game_rows) in enumerate(df.groupby("Game ID")):
        mirrored_game_id = first_new_game_id + offset  # one fresh ID per game

        for _, row in game_rows.iterrows():
            board = row["Board State"]
            move = row["Move"]
            passthrough = row.drop(key_columns).to_dict()  # columns copied as-is

            # Original position keeps its Game ID
            augmented_rows.append(
                {
                    "Game ID": game_id,
                    "Board State": board,
                    "Move": move,
                    **passthrough,
                }
            )

            # Mirrored position: reflect the move across the board's actual
            # width instead of assuming a 7-column board.
            flipped_board = flip_board(board)
            last_column = flipped_board.shape[1] - 1
            augmented_rows.append(
                {
                    "Game ID": mirrored_game_id,
                    "Board State": flipped_board,
                    "Move": last_column - move,
                    **passthrough,
                }
            )

    return pd.DataFrame(augmented_rows, columns=output_columns)


# Augment the standardized dataset with mirrored copies of every position
connect4_wflip = add_flipped_boards(connect4_standardized)
# Shown as the cell output: (rows, columns) of the augmented frame
connect4_wflip.shape

(237388, 6)

# Transformer architecture

In [14]:
class PositionalIndex(tf.keras.layers.Layer):
    """Produce the indices 0..(sequence length - 1) for every sample in the batch."""

    def call(self, x):
        dynamic_shape = tf.shape(x)
        batch_size = dynamic_shape[0]
        # Counted from the input rather than assumed (expected to be m*n)
        sequence_length = dynamic_shape[1]

        # Row vector [[0, 1, ..., sequence_length - 1]], repeated once per sample
        position_row = tf.expand_dims(tf.range(sequence_length), 0)
        return tf.tile(position_row, [batch_size, 1])

In [15]:
class ClassTokenIndex(tf.keras.layers.Layer):
    """Produce a single index (0) per sample, used to look up the class token."""

    def call(self, x):
        batch_size = tf.shape(x)[0]

        # Only one vector is needed per sample, so the index range is just [0];
        # shape it as a row and repeat it once per sample.
        token_row = tf.expand_dims(tf.range(1), 0)
        return tf.tile(token_row, [batch_size, 1])

In [16]:
def _encoder_block(
    tokens, hidden_dim, num_heads, key_dim, value_dim, mlp_dim, dropout_rate
):
    """Apply one block: normalized self-attention, then an MLP, each with a residual add."""
    normed = tf.keras.layers.LayerNormalization()(tokens)
    attended = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=key_dim, value_dim=value_dim
    )(normed, normed, normed)  # query, value and key are all the same tensor
    after_attention = tf.keras.layers.Add()([tokens, attended])

    normed = tf.keras.layers.LayerNormalization()(after_attention)
    hidden = tf.keras.layers.Dense(mlp_dim, activation="gelu")(normed)
    hidden = tf.keras.layers.Dropout(dropout_rate)(hidden)
    hidden = tf.keras.layers.Dense(hidden_dim)(hidden)  # project back to hidden_dim
    hidden = tf.keras.layers.Dropout(dropout_rate)(hidden)
    return tf.keras.layers.Add()([hidden, after_attention])


def build_ViT(
    n,
    m,
    block_size,
    hidden_dim,
    num_layers,
    num_heads,
    key_dim,
    value_dim,
    mlp_dim,
    dropout_rate,
    num_classes,
):
    """
    Build and compile a ViT-style classifier over an n x m grid of blocks.

    Args:
        n: number of rows of blocks.
        m: number of columns of blocks.
        block_size: number of values in each block (length of each input vector).
        hidden_dim: width of the token vectors used throughout the model.
        num_layers: number of encoder blocks stacked after the embedding.
        num_heads, key_dim, value_dim: passed to each MultiHeadAttention layer.
        mlp_dim: width of the hidden Dense layer inside each block.
        dropout_rate: rate of the Dropout layers inside each block.
        num_classes: size of the softmax output.

    Returns:
        A Keras Model taking input of shape (n * m, block_size) per sample,
        compiled with the Adam optimizer, sparse categorical cross-entropy
        (integer class labels) and an accuracy metric.
    """
    inp = tf.keras.layers.Input(shape=(n * m, block_size))

    # Project each block vector to hidden_dim and add a learned embedding for
    # each of the n*m positions.
    projected = tf.keras.layers.Dense(hidden_dim)(inp)
    position_ids = PositionalIndex()(inp)
    position_emb = tf.keras.layers.Embedding(input_dim=n * m, output_dim=hidden_dim)(
        position_ids
    )
    tokens = tf.keras.layers.Add()([projected, position_emb])

    # Prepend a learned class token to every sequence
    class_ids = ClassTokenIndex()(tokens)
    class_token = tf.keras.layers.Embedding(input_dim=1, output_dim=hidden_dim)(
        class_ids
    )
    tokens = tf.keras.layers.Concatenate(axis=1)([class_token, tokens])

    for _ in range(num_layers):
        tokens = _encoder_block(
            tokens, hidden_dim, num_heads, key_dim, value_dim, mlp_dim, dropout_rate
        )

    # Classify from the class token only (position 0 of every sequence)
    class_repr = tokens[:, 0, :]
    class_repr = tf.keras.layers.LayerNormalization()(class_repr)
    probabilities = tf.keras.layers.Dense(num_classes, activation="softmax")(
        class_repr
    )

    mod = tf.keras.models.Model(inp, probabilities)
    mod.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )
    return mod

In [33]:
# Board geometry: n x m cells, each encoded as a vector of block_size values
n = 6
m = 7
block_size = 2

# Transformer hyperparameters
hidden_dim = 100
num_layers = 16
num_heads = 4
# Usual practice: split the hidden width evenly across the attention heads
key_dim = hidden_dim // num_heads
value_dim = key_dim * 2
mlp_dim = hidden_dim
dropout_rate = 0.05
num_classes = 7  # size of the softmax output

# Pass everything by keyword so the argument order cannot be mixed up
trans = build_ViT(
    n=n,
    m=m,
    block_size=block_size,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_heads=num_heads,
    key_dim=key_dim,
    value_dim=value_dim,
    mlp_dim=mlp_dim,
    dropout_rate=dropout_rate,
    num_classes=num_classes,
)
trans.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 42, 2)]      0           []                               
                                                                                                  
 positional_index_5 (Positional  (None, 42)          0           ['input_6[0][0]']                
 Index)                                                                                           
                                                                                                  
 dense_114 (Dense)              (None, 42, 100)      300         ['input_6[0][0]']                
                                                                                                  
 embedding_10 (Embedding)       (None, 42, 100)      4200        ['positional_index_5[0][0]'

In [34]:
# Hand-computed parameter count for one multi-head attention layer, as a
# cross-check against the model summary.
# Query and key projections: weights plus bias, per head
query_key_params = (hidden_dim * key_dim + key_dim) * 2
# Value projection: weights plus bias, per head
value_params = hidden_dim * value_dim + value_dim
# Output projection from the concatenated heads back to hidden_dim, plus bias
output_params = (value_dim * num_heads) * hidden_dim + hidden_dim

(query_key_params + value_params) * num_heads + output_params

60500

# Prepare data for training and testing

In [35]:
def subset_games(df, num_games, random_state=42):
    """
    Keep every row belonging to a random sample of unique game IDs.

    Parameters:
        df (pd.DataFrame): The input DataFrame with a 'Game ID' column.
        num_games (int): The number of unique game IDs to keep.
        random_state: Seed passed to the sampler; the default of 42 reproduces
            the selection this function previously hard-coded.

    Returns:
        pd.DataFrame: The rows of ``df`` whose 'Game ID' was sampled, in their
        original order.

    Raises:
        ValueError: If ``num_games`` exceeds the number of unique game IDs.
    """
    unique_game_ids = df["Game ID"].unique()

    # Fail early with a clear message instead of letting the sampler raise
    if num_games > len(unique_game_ids):
        raise ValueError(
            f"num_games ({num_games}) exceeds the total unique game IDs ({len(unique_game_ids)})."
        )

    # Sample whole games (not individual rows) so each kept game stays complete
    sampled_game_ids = pd.Series(unique_game_ids).sample(
        num_games, random_state=random_state
    )

    return df[df["Game ID"].isin(sampled_game_ids)]


# Number of distinct Game IDs to keep (mirrored games carry their own IDs)
num_games = 6000
connect4_testing = subset_games(df=connect4_wflip, num_games=num_games)

# Report how many positions the sampled games contribute
print(f"Subset contains {len(connect4_testing)} board states from {num_games} games.")

Subset contains 177902 board states from 6000 games.


In [36]:
# Extract features (board states) and labels (recommended moves)
X = np.stack(
    connect4_testing["Board State"].values
)  # Convert board states into a NumPy array
y = connect4_testing["Move"].values

# Normalize the board states (optional for CNNs)
X = X.astype("float32") / 1.0  # Assuming board states are binary (0 or 1)

# Convert labels to one-hot encoding (required for multi-class classification)
num_classes = 7  # Moves are in columns 0-6
y = to_categorical(y, num_classes)

# Split into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print(
    f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}"
)


ndata_train = X_train.shape[0]
ndata_test = X_test.shape[0]

Training set: (124531, 6, 7, 2), Validation set: (26685, 6, 7, 2), Test set: (26686, 6, 7, 2)


In [37]:
def _as_class_indices(labels):
    """Return integer class indices; one-hot (2-D) input is reduced with argmax."""
    labels = np.asarray(labels)
    # Already-converted 1-D labels pass through, so re-running this cell is safe
    return np.argmax(labels, axis=1) if labels.ndim > 1 else labels


# The model is compiled with sparse_categorical_crossentropy, which expects
# integer class indices, so convert every split (validation included).
y_train = _as_class_indices(y_train)
y_val = _as_class_indices(y_val)
y_test = _as_class_indices(y_test)

In [38]:
# Flatten each n x m board into a sequence of n*m cell vectors.
# A row-major reshape puts cell (row, col) at index row * m + col, which is
# the order the transformer's positional indices follow. The result is
# float64, matching the np.zeros buffer this replaces.
x_train_ravel = (
    X_train[:ndata_train, :n, :m, :]
    .reshape(ndata_train, n * m, block_size)
    .astype(np.float64)
)

In [39]:
# Sanity check: expect (number of training samples, n * m, block_size)
x_train_ravel.shape

(124531, 42, 2)

In [40]:
# Flatten each n x m test board into a sequence of n*m cell vectors.
# A row-major reshape puts cell (row, col) at index row * m + col; the result
# is float64, matching the np.zeros buffer this replaces.
x_test_ravel = (
    X_test[:ndata_test, :n, :m, :]
    .reshape(ndata_test, n * m, block_size)
    .astype(np.float64)
)

### Train 

In [41]:
# Train for 10 epochs in batches of 40; validation_split=0.15 asks Keras to hold
# out 15% of the arrays passed here for validation.
# NOTE(review): X_val / y_val from the earlier split are not used by this call --
# confirm whether that is intended.
trans.fit(x_train_ravel, y_train, epochs=10, batch_size=40, validation_split=0.15)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x35fb06a50>

In [30]:
# Evaluate the model on the held-out test arrays and keep the returned values.
# NOTE(review): this cell's execution count (30) is lower than the training
# cell's (41), so the stored output may predate the latest fit -- re-run after training.
out = trans.evaluate(x_test_ravel, y_test)



In [31]:
# Second value returned by evaluate(); with the model compiled using
# metrics=["accuracy"] this is presumably the test accuracy -- TODO confirm
out[1]

0.4313497841358185