In [3]:
import argparse
import numpy as np
import pandas as pd
import random
from time import time
from collections import Counter
from GraphTsetlinMachine.graphs import Graphs
from GraphTsetlinMachine.tm import MultiClassGraphTsetlinMachine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Seed both global random generators (NumPy's and the standard library's) with
# the same fixed value so repeated runs draw the same random numbers.
seed_value = 42
np.random.seed(seed_value)
random.seed(seed_value)

# Define default arguments using a class
class Args:
    """Plain attribute container for the run's hyperparameters.

    Every keyword argument given to the constructor is stored as an attribute:
    it replaces the default of the same name, or is added as a new attribute
    when the name is not one of the defaults.
    """

    def __init__(self, **kwargs):
        settings = {
            'epochs': 50,
            'number_of_clauses': 2000,
            'T': 1500,
            's': 1.2,
            'depth': 3,
            'hypervector_size': 1024,
            'hypervector_bits': 2,
            'message_size': 1024,
            'message_bits': 2,
            'double_hashing': True,
            'max_included_literals': 32,
        }
        # Caller-supplied values win over the defaults listed above.
        settings.update(kwargs)
        for name, value in settings.items():
            setattr(self, name, value)

# Hyperparameters for this run (class defaults, no overrides).
args = Args()

# Read the dataset from 'datasett/3x3.csv' and report how long the read took.
start_time = time()
try:
    data = pd.read_csv('datasett/3x3.csv')
except FileNotFoundError:
    print("Error: 'datasett/3x3.csv' not found. Please ensure the dataset is in the correct directory.")
    # Raise SystemExit directly instead of calling exit(): that helper is
    # injected by the `site` module for interactive sessions and is not meant
    # to be relied on from program code. The exit status (-1) is unchanged.
    raise SystemExit(-1)
end_time = time()
print(f"Loading data took {end_time - start_time:.2f} seconds")

board_size = 3
# Names of the board-cell columns in row-major order:
# ['cell0_0', 'cell0_1', ..., 'cell2_2']
cell_columns = [f'cell{row}_{col}' for row in range(board_size) for col in range(board_size)]

# Stop early when the label, the starting player or any cell column is absent
# from the loaded data.
required_columns = ['winner', 'starting_player'] + cell_columns
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    print(f"Error: Missing columns in the dataset: {missing_columns}")
    # Raise SystemExit directly instead of calling exit(): that helper is
    # injected by the `site` module for interactive sessions and is not meant
    # to be relied on from program code. The exit status (-1) is unchanged.
    raise SystemExit(-1)

# Features are the board-cell columns; the label ('winner') and the starting
# player are pulled out as integer NumPy arrays.
X_df = data[cell_columns]
y, starting_player = (
    data[column].values.astype(int) for column in ('winner', 'starting_player')
)

# Missing cells are replaced with 0 (the value mapped to 'Empty' later on).
if X_df.isna().values.any():
    print("Warning: Missing values detected in X_df. Filling missing values with 0.")
    X_df = X_df.fillna(0)

# Re-index the labels as 0..k-1 (in sorted order of the original values)
# whenever they contain anything other than 0 and 1.
unique_labels = np.unique(y)
if not set(unique_labels) <= {0, 1}:
    label_mapping = dict(zip(unique_labels, range(len(unique_labels))))
    y = np.array(list(map(label_mapping.__getitem__, y)))
    print("Labels mapped to:", label_mapping)

# Report how many samples each class has.
label_counts = Counter(y)
print(f"Class distribution: {label_counts}")

# Hold out 20% of the samples for testing.
test_size_dynamic = 0.2

# Split boards, labels and starting players together so they stay aligned;
# the split is stratified on the label and seeded with seed_value.
try:
    X_train_df, X_test_df, y_train, y_test, sp_train, sp_test = train_test_split(
        X_df, y, starting_player, test_size=test_size_dynamic, random_state=seed_value, stratify=y
    )
except ValueError as e:
    print(f"Error during train-test split: {e}")
    # Raise SystemExit directly instead of calling exit(): that helper is
    # injected by the `site` module for interactive sessions and is not meant
    # to be relied on from program code. The exit status (-1) is unchanged.
    raise SystemExit(-1)

print(f"X_train shape: {X_train_df.shape}")
print(f"X_test shape: {X_test_df.shape}")

# Labels and starting players went in as plain arrays, so wrap the split
# results in Series that carry the row labels of the matching feature frame;
# this lets them be selected by the same labels as the features afterwards.
y_train = pd.Series(y_train, index=X_train_df.index)
y_test = pd.Series(y_test, index=X_test_df.index)
sp_train = pd.Series(sp_train, index=X_train_df.index)
sp_test = pd.Series(sp_test, index=X_test_df.index)

# Report the class distribution of each split.
train_label_counts = Counter(y_train)
test_label_counts = Counter(y_test)
print("Training set class distribution:", train_label_counts)
print("Test set class distribution:", test_label_counts)

# Undersample every class down to the size of the rarest one when the training
# classes differ in size; otherwise leave the training set untouched.
min_class_size = min(train_label_counts.values())
if max(train_label_counts.values()) > min_class_size:
    classes = np.unique(y_train)
    # Row labels of the training samples belonging to each class.
    class_indices = {cls: y_train[y_train == cls].index for cls in classes}

    # Draw min_class_size distinct row labels per class, then mix the classes.
    selected_indices = []
    for indices in class_indices.values():
        selected_indices.extend(np.random.choice(indices, min_class_size, replace=False))
    np.random.shuffle(selected_indices)

    # Keep only the drawn rows (in shuffled order) and renumber them from 0.
    X_train_df = X_train_df.loc[selected_indices].reset_index(drop=True)
    y_train = y_train.loc[selected_indices].reset_index(drop=True)
    sp_train = sp_train.loc[selected_indices].reset_index(drop=True)

    # Show the class distribution after undersampling.
    train_label_counts = Counter(y_train)
    print("Balanced training set class distribution:", train_label_counts)
else:
    print("Training set is already balanced.")

# Sanity check: features and labels must still have one entry per sample.
assert len(X_train_df) == len(y_train), "Mismatch between X_train_df and y_train after balancing."

# Symbol attached to a node for each raw cell value found in the dataset.
value_to_symbol = {1: 'X', -1: 'O', 0: 'Empty'}

# Full symbol vocabulary handed to the graph encoder: cell contents first,
# then the starting-player markers, then the position types.
_board_symbols = ['X', 'O', 'Empty']
_starter_symbols = [f'StartingPlayer{player}' for player in range(2)]
_location_symbols = ['Center', 'Edge', 'Corner']
symbol_names = _board_symbols + _starter_symbols + _location_symbols

print("Extended Symbol Names:", symbol_names)

# Helper function to map (q, r) to node_id
def position_to_edge_id(pos, board_size):
    """Return the row-major index of a board position.

    The first coordinate of ``pos`` is scaled by ``board_size`` and the second
    coordinate is added, so on a 3-wide board ``(0, 0)`` maps to 0 and
    ``(1, 2)`` maps to 5.
    """
    q, r = pos[0], pos[1]
    return q * board_size + r

# Prepare the graph data
def _position_type(q, r, size):
    """Return 'Center', 'Corner' or 'Edge' for cell (q, r) on a size x size board."""
    last = size - 1
    middle = size // 2
    # Only boards with an odd side length have a single middle cell.
    if size % 2 == 1 and q == middle and r == middle:
        return 'Center'
    if q in (0, last) and r in (0, last):
        return 'Corner'
    return 'Edge'


def prepare_graph_data(X_df, sp_series):
    """Build and encode one board graph per row of ``X_df``.

    Each graph has ``board_size**2`` nodes, one per cell in row-major order
    (node ``q * board_size + r`` is cell ``(q, r)``). Every node receives three
    properties: the symbol of its cell value (looked up in ``value_to_symbol``,
    falling back to 'Empty' for unknown values), the game's starting player as
    'StartingPlayer<value>', and its position type ('Center', 'Corner' or
    'Edge'). Cells are linked to their neighbours in the 8-connected grid.

    Args:
        X_df: DataFrame with one row per board; column ``i`` holds the value
            of node ``i``.
        sp_series: Series with the starting player of each row of ``X_df``,
            matched by position (not by index label).

    Returns:
        The ``Graphs`` object after ``encode()`` has been called on it.
    """
    num_graphs = X_df.shape[0]
    num_nodes = board_size ** 2
    graphs = Graphs(
        number_of_graphs=num_graphs,
        symbols=symbol_names,
        hypervector_size=args.hypervector_size,
        hypervector_bits=args.hypervector_bits,
        double_hashing=args.double_hashing,
    )

    # Nodes are the (q, r) coordinates in row-major order.
    nodes = [(q, r) for q in range(board_size) for r in range(board_size)]
    node_id_map = {pos: idx for idx, pos in enumerate(nodes)}

    # Offsets (dq, dr) of the 8 surrounding cells.
    directions = [
        (0, 1),
        (1, 1),
        (1, 0),
        (1, -1),
        (0, -1),
        (-1, -1),
        (-1, 0),
        (-1, 1),
    ]

    # The edge list is identical for every graph, so build it once.
    # NOTE(review): each neighbouring pair is stored a single time, from the
    # lower to the higher node id, and only the lower node's edge count is
    # incremented. If the library treats edges as directed, information can
    # only travel in that direction — confirm this is intended.
    edges = []
    n_edges_list = [0] * num_nodes
    for q, r in nodes:
        node_id = node_id_map[(q, r)]
        for dq, dr in directions:
            neighbor = (q + dq, r + dr)
            if neighbor not in node_id_map:
                continue  # neighbour would fall off the board
            neighbor_id = node_id_map[neighbor]
            if node_id < neighbor_id:
                edges.append((node_id, neighbor_id))
                n_edges_list[node_id] += 1

    # The position type depends only on the node, not on the graph.
    position_symbols = [_position_type(q, r, board_size) for q, r in nodes]

    # Declare the node count of every graph.
    for graph_id in range(num_graphs):
        graphs.set_number_of_graph_nodes(graph_id=graph_id, number_of_graph_nodes=num_nodes)
    graphs.prepare_node_configuration()

    # Register each node together with its number of outgoing edges.
    for graph_id in range(num_graphs):
        for k in range(num_nodes):
            graphs.add_graph_node(graph_id, k, n_edges_list[k])
    graphs.prepare_edge_configuration()

    # Convert the inputs to NumPy once: positional pandas lookups inside the
    # per-graph loop yield the same values but are much slower.
    board_states = X_df.values.astype(int)
    starting_players = sp_series.values

    # Attach node properties and edges graph by graph.
    for graph_id in range(num_graphs):
        board_state = board_states[graph_id]
        starting_player_symbol = f'StartingPlayer{starting_players[graph_id]}'

        for idx in range(num_nodes):
            graphs.add_graph_node_property(graph_id, idx, value_to_symbol.get(board_state[idx], 'Empty'))
            graphs.add_graph_node_property(graph_id, idx, starting_player_symbol)
            graphs.add_graph_node_property(graph_id, idx, position_symbols[idx])

        for node_id, neighbor_id in edges:
            graphs.add_graph_node_edge(graph_id, node_id, neighbor_id, edge_type_name=0)

    # Encode the graphs
    graphs.encode()
    return graphs

# Build the encoded graph collections for both splits.
# NOTE(review): the test graphs are created as an independent Graphs object;
# confirm that their symbol encoding is guaranteed to match the training
# graphs' encoding (the library appears to offer an `init_with` argument for
# sharing it — verify).
print("Preparing training graphs...")
graphs_train = prepare_graph_data(X_train_df, sp_train)
print("Training graphs created.")

print("Preparing test graphs...")
graphs_test = prepare_graph_data(X_test_df, sp_test)
print("Test graphs created.")

# Turn the label Series into flat int32 NumPy arrays.
y_train = np.ravel(y_train.values.astype(np.int32))
y_test = np.ravel(y_test.values.astype(np.int32))

# Show the resulting shapes and dtypes.
print("y_train shape:", y_train.shape, "dtype:", y_train.dtype)
print("y_test shape:", y_test.shape, "dtype:", y_test.dtype)

# Create the graph Tsetlin machine from the hyperparameters in `args`; the
# number of classes is taken from the distinct training labels.
try:
    tm = MultiClassGraphTsetlinMachine(
        args.number_of_clauses,
        args.T,
        args.s,
        len(np.unique(y_train)),  # number_of_classes as positional argument
        depth=args.depth,
        max_included_literals=args.max_included_literals,
        message_size=args.message_size,
        message_bits=args.message_bits,
        grid=(16*13,1,1),   # Adjust based on your GPU setup
        block=(128,1,1)
    )
except TypeError as e:
    print(f"Initialization Error: {e}")
    # Raise SystemExit directly instead of calling exit(): that helper is
    # injected by the `site` module for interactive sessions and is not meant
    # to be relied on from program code. The exit status (-1) is unchanged.
    raise SystemExit(-1)

# Train one epoch at a time so train and test accuracy can be reported after
# every epoch.
start_training = time()
# These stay None when args.epochs < 1, which lets the evaluation below detect
# that no predictions exist instead of failing with a NameError.
train_predictions = None
test_predictions = None
for i in range(args.epochs):
    tm.fit(graphs_train, y_train, epochs=1, incremental=True)
    train_predictions = tm.predict(graphs_train)
    train_accuracy = np.mean(y_train == train_predictions)
    test_predictions = tm.predict(graphs_test)
    test_accuracy = np.mean(y_test == test_predictions)
    print(f"Epoch#{i+1} -- Accuracy train: {train_accuracy:.4f} -- Accuracy test: {test_accuracy:.4f}")
stop_training = time()
print(f"Training Time: {stop_training - start_training:.2f} seconds")

if test_predictions is None:
    print("No training epochs were run; skipping evaluation.")
else:
    # Evaluation metrics, based on the predictions of the last epoch.
    print("\nClassification Report on Test Set:")
    print(classification_report(y_test, test_predictions, digits=4))

    print("Confusion Matrix on Test Set:")
    print(confusion_matrix(y_test, test_predictions))

    # Show which classes the model predicts, and how often, on each split.
    print("\nChecking predictions on training and test sets...")
    unique_train_preds, counts_train_preds = np.unique(train_predictions, return_counts=True)
    print("Unique predictions on training set:", unique_train_preds)
    print("Training set predictions distribution:", dict(zip(unique_train_preds, counts_train_preds)))

    unique_test_preds, counts_test_preds = np.unique(test_predictions, return_counts=True)
    print("Unique predictions on test set:", unique_test_preds)
    print("Test set predictions distribution:", dict(zip(unique_test_preds, counts_test_preds)))

Loading data took 0.54 seconds
Class distribution: Counter({1: 500192, 0: 499808})
X_train shape: (800000, 9)
X_test shape: (200000, 9)
Training set class distribution: Counter({1: 400154, 0: 399846})
Test set class distribution: Counter({1: 100038, 0: 99962})
Balanced training set class distribution: Counter({0: 399846, 1: 399846})
Extended Symbol Names: ['X', 'O', 'Empty', 'StartingPlayer0', 'StartingPlayer1', 'Center', 'Edge', 'Corner']
Preparing training graphs...
Training graphs created.
Preparing test graphs...
Test graphs created.
y_train shape: (799692,) dtype: int32
y_test shape: (200000,) dtype: int32
Initialization of sparse structure.
Epoch#1 -- Accuracy train: 0.9983 -- Accuracy test: 0.9984
Epoch#2 -- Accuracy train: 1.0000 -- Accuracy test: 1.0000
Epoch#3 -- Accuracy train: 1.0000 -- Accuracy test: 1.0000


KeyboardInterrupt: 