# 4: Model Training

## Imports

In [1]:
from utils.model_utils import *
import torch.onnx

2024-09-04 19:17:09.208213: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Variables Declaration

In [2]:
LR = 0.1
loss_func = nn.BCELoss()
NUM_HANGING_VALUES = 10
EPOCHS = 30
BITBOARD_SHAPE = (76*2, 8, 8)
CHANGE_LEARNING_RATE = True
UPDATE_EPOCHS = [4, 10, 15]
RESIDUAL_BLOCKS = 6
RESIDUAL_FILTERS = 64
SE_RATIO = 8
MODEL_FILENAME = "./Models/PikeBot_Models/PikeBot.pth"
ONNX_FILENAME = "./Models/PikeBot_Models/PikeBot.onnx"
LOG_FILE_LOCATION = "./Training_Logs/Training.txt"
CHECKPOINT_FILENAME_LOCATION = "./Models/PikeBot_Models/PikeBot_checkpoint.pth"
TRAIN_GENERATOR_PATH = "./Generators/train_generator.pkl"
VAL_GENERATOR_PATH = "./Generators/val_generator.pkl"
TEST_GENERATOR_PATH = "./Generators/test_generator.pkl"
TEMP_STATE_PATH = "./Models/PikeBot_Models/temp_state_dict.pth"
OPSET_VERSION = 11

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = "cpu"
print(f"Detected Device: {device}")

Detected Device: cuda


## Model Sanity Check

In [3]:
model = ChessModel_V2(bit_board_shape=BITBOARD_SHAPE, num_float_inputs=NUM_HANGING_VALUES, residual_blocks=RESIDUAL_BLOCKS, residual_filters=RESIDUAL_FILTERS, se_ratio=SE_RATIO)
#model = Chess_Model(bit_board_shape=BITBOARD_SHAPE, num_float_inputs=NUM_HANGING_VALUES, channel_multiple=2)
num_params = count_parameters(model)
print("Number of parameters in the model in millions:", round(num_params/(1e6), 4))

Number of parameters in the model in millions: 3.8303


In [4]:
print(model)

ChessModel_V2(
  (relu): ReLU()
  (conv_block): ConvBlockV2(
    (conv): Conv2d(152, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (residual_blocks): Sequential(
    (0): ResidualBlockV2(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (se): SqueezeExcitationV2(
        (global_avg_pool): AdaptiveAvgPool2d(output_size=1)
        (fc1): Linear(in_features=64, out_features=8, bias=True)
        (fc2): Linear(in_features=8, out_features=128, bias=True)
      )
      (relu): ReLU()
    )
    (1): ResidualBlockV2(
      (conv1): Conv

In [5]:
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
board_shape = (1, BITBOARD_SHAPE[0], BITBOARD_SHAPE[1], BITBOARD_SHAPE[2])
floats_shape = (1, NUM_HANGING_VALUES)
input_bitboard = torch.tensor(np.random.rand(*board_shape), dtype = torch.float32)
input_floats = torch.tensor(np.random.rand(*floats_shape), dtype = torch.float32)
output = model(input_bitboard, input_floats)
output

tensor([[0.5322]], grad_fn=<SigmoidBackward0>)

## Model Training

In [6]:
train_generator = efficent_load_object(TRAIN_GENERATOR_PATH)
train_generator.__len__()

625695

In [7]:
val_generator = efficent_load_object(VAL_GENERATOR_PATH)
val_generator.__len__()

63153

In [8]:
test_generator = efficent_load_object(TEST_GENERATOR_PATH)
test_generator.__len__()

62784

In [9]:
del train_generator
del val_generator
del test_generator

In [10]:
model = model.to(device)
model = train(TRAIN_GENERATOR_PATH, VAL_GENERATOR_PATH, TEST_GENERATOR_PATH, model, optimizer, loss_func, NUM_HANGING_VALUES, EPOCHS, device,
              learning_rate=LR, log = 1, log_file = "./Training_Logs/Training.txt", verbose = 1, val = True, early_callback=False, early_callback_epochs=None,
              checkpoint=True, epochs_per_checkpoint=1, break_after_checkpoint=False, checkpoint_filename="./Models/PikeBot_Models/PikeBot_checkpoint.pth",
              change_learning_rate=CHANGE_LEARNING_RATE, update_epochs=UPDATE_EPOCHS)

______________________________________________________________
Epoch 0 Train Loss: 0.7871 | MSE: 0.5037 | MAE: 0.5037 | Accuracy: 0.4963
Epoch 0 Val Loss: 0.7871 | MSE: 0.5037 | MAE: 0.5037  | Accuracy: 0.4963
Epoch 1: Saving checkpoint...
______________________________________________________________
Epoch 1 Train Loss: 0.7871 | MSE: 0.5037 | MAE: 0.5037 | Accuracy: 0.4963
Epoch 1 Val Loss: 0.7871 | MSE: 0.5037 | MAE: 0.5037  | Accuracy: 0.4963
Epoch 2: Saving checkpoint...
______________________________________________________________
Epoch 2 Train Loss: 0.7871 | MSE: 0.5037 | MAE: 0.5037 | Accuracy: 0.4963
Epoch 2 Val Loss: 0.7871 | MSE: 0.5037 | MAE: 0.5037  | Accuracy: 0.4963
Epoch 3: Saving checkpoint...
______________________________________________________________
Epoch 3 Train Loss: 0.7871 | MSE: 0.5037 | MAE: 0.5037 | Accuracy: 0.4963
Epoch 3 Val Loss: 0.7871 | MSE: 0.5037 | MAE: 0.5037  | Accuracy: 0.4963
Epoch 4: Saving checkpoint...
Updated the learning rate, current learn

## Saving Model

In [11]:
model

ChessModel_V2(
  (relu): ReLU()
  (conv_block): ConvBlockV2(
    (conv): Conv2d(152, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
  )
  (residual_blocks): Sequential(
    (0): ResidualBlockV2(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (se): SqueezeExcitationV2(
        (global_avg_pool): AdaptiveAvgPool2d(output_size=1)
        (fc1): Linear(in_features=64, out_features=8, bias=True)
        (fc2): Linear(in_features=8, out_features=128, bias=True)
      )
      (relu): ReLU()
    )
    (1): ResidualBlockV2(
      (conv1): Conv

In [12]:
torch.save(model.state_dict(), TEMP_STATE_PATH)

In [14]:
model = ChessModel_V2(bit_board_shape=BITBOARD_SHAPE, num_float_inputs=NUM_HANGING_VALUES, residual_blocks=RESIDUAL_BLOCKS, residual_filters=RESIDUAL_FILTERS, se_ratio=SE_RATIO)
model.load_state_dict(torch.load(TEMP_STATE_PATH))
save_model(model, model_filename=MODEL_FILENAME, onnx_filename=ONNX_FILENAME,
           bitboard_input_shape=board_shape, hanging_values_input_shape=floats_shape, opset_version=OPSET_VERSION, device="cpu")

  model.load_state_dict(torch.load(TEMP_STATE_PATH))


Model saved successfully!


## Post-Training Sanity Check

In [15]:
model = torch.load(MODEL_FILENAME)
model = model.to(device)
model.eval()
test_model(model, loss_func=loss_func, num_hanging_values=NUM_HANGING_VALUES, device=device, test_generator_path=TEST_GENERATOR_PATH)

  model = torch.load(MODEL_FILENAME)


Testing Complete, Loss: 0.7871 | MSE: 0.5037 | MAE: 0.5037 | Accuracy: 0.4963
