# SURVIVAL RATE AGAINST ENGINE

This notebook evaluates how well a GPT-2 model (or another architecture) trained on chess notation performs against an automated chess engine. We assess the survival rate by simulating multiple games in which the model and the engine face off under controlled conditions.

## Key Objectives:

1. **Model Loading**: Load a pre-trained chess model from Hugging Face.
2. **Game Simulation**: Play a series of chess games between the model and the engine, recording game outcomes and move quality.
3. **Performance Metrics**: Analyze the games to calculate the Average Centipawn Loss (ACPL) for each side, providing insights into the typical move quality and strategic depth exhibited by the model compared to the engine. Additionally, we calculate the survival rate of the model against the engine, i.e. the number of plies the model survives before losing the game. (It is expected that the engine will win most games.)
4. **Statistical Analysis**: Compile and visualize the results to identify patterns and assess the model's consistency and robustness across games.
5. **Integration with W&B**: Log the results and metrics to Weights & Biases for tracking experiments and further analysis.

## Imports

In [None]:
from transformers import GPT2LMHeadModel
from src.chess_game import ChessGame

## Load Model from Hugging Face

* models: https://huggingface.co/collections/Leon-LLM/leon-llm-chess-models-6584387dbef870ffa4a7605f

In [None]:
# # modelname = "Leon-LLM/Leon-Chess-1M-BOS"  # Huggingface model name
# # modelname = "Leon-LLM/Leon-Chess-350k-Plus"
# modelname = "Leon-LLM/V63_GPT2_350k_4E_xLANplus_RIGHT_PAD"
# version = "V63"
# model = GPT2LMHeadModel.from_pretrained(modelname)

In [None]:
from transformers import AutoModelForCausalLM

# Base checkpoint that the adapter below is loaded onto.
model_id = "Leon-LLM/V63_GPT2_350k_4E_xLANplus_RIGHT_PAD"

# Adapter to evaluate; uncomment exactly one of the alternatives to test another run.
# peft_model_id = "Leon-LLM/V64_LoRA_V63_GPT2-350k-Plus_10k_low_elo_4E_r64"
# peft_model_id = "Leon-LLM/V65_LoRA_V63_GPT2-350k-Plus_30k_low_elo_4E_r64"
# peft_model_id = "Leon-LLM/V66_LoRA_V63_GPT2-350k-Plus_98k_low_elo_4E_r64"
# peft_model_id = "Leon-LLM/V67_LoRA_V63_GPT2-350k-Plus_10k_high_elo_4E_r64"
# peft_model_id = "Leon-LLM/V68_LoRA_V63_GPT2-350k-Plus_98k_high_elo_4E_r64"
peft_model_id = "Leon-LLM/V69_LoRA_V63_GPT2-350k-Plus_30k_high_elo_4E_r64"

# Load the base model first, then attach the adapter to it.
model = AutoModelForCausalLM.from_pretrained(model_id)
model.load_adapter(peft_model_id)

# The version tag is the text before the first "_" in the adapter's repository name
# (the part of the id after the "/").
adapter_repo = peft_model_id.split("/")[1]
version = adapter_repo.partition("_")[0]
print(f"Model version: {version}")

## Play One Game

In [None]:
# --- Configuration for a single game ---
notation = "xLANplus"  # one of "xLANplus", "xLAN", "xLANchk" or "xLANcap"
player1 = "model"  # white side: "player", "model", or "engine"
player2 = "model"  # black side: "player", "model", or "engine"

# Controls the randomness of the model; choose a value between 0.1 and 2.0.
temperature = 0.1
max_model_tries = 15
mate_score = 100_000

# True shows the game history and keeps the board for every move (takes up a lot of space).
show_game_history = False

# Both sides are backed by the same loaded model in this cell.
game = ChessGame(
    player1_type=player1,
    model_p1=model,
    player2_type=player2,
    model_p2=model,
    notation=notation,
    xlanplus=True,
    starting_sequence="",
    temperature=temperature,
    max_model_tries=max_model_tries,
    mate_score=mate_score,
    manual_input=False,
    show_output=True,
    show_game_history=show_game_history,
)

In [None]:
# Play the game configured in the previous cell; how the game is driven is
# defined by ChessGame.play_game (not shown in this notebook).
game.play_game()

In [None]:
# Print the history of the game object used above (presumably the moves that
# were played; the output format is defined by ChessGame).
game.print_game_history()

In [None]:
# Fetch the statistics of the game. According to the accessor, the result holds, in order:
# self.movehistory, self.number_of_plies, self.outcome, self.board.result(), self.player_scores
stats = game.get_stats()
stats

## Calculate Average Centipawn Loss

In [None]:
def calculate_acpl(scores_original, mate_score=100000, mate_threshold=50):
    """Compute the average centipawn loss (ACPL) for White and for Black.

    The scores are treated as one evaluation per position, where a drop in
    the score is a loss for White and a rise in the score is a loss for
    Black. The transition into index ``i`` is attributed to White when ``i``
    is odd and to Black when ``i`` is even.

    Transitions that start or end on a (near-)mate score are ignored so that
    the huge mate values do not dominate the average. They are skipped in
    place rather than removed from the list beforehand: removing them would
    shift every later index, flip the White/Black attribution of all
    following moves and compare positions that were not consecutive.

    Args:
        scores_original: Sequence of numeric evaluation scores. It is only
            read, never modified.
        mate_score: Magnitude used to encode a forced mate.
        mate_threshold: Scores with ``abs(score) >= mate_score -
            mate_threshold`` are considered mate scores.

    Returns:
        A tuple ``(average_white_loss, average_black_loss)``. A side with no
        counted moves gets ``0``.
    """
    mate_cutoff = mate_score - mate_threshold

    white_losses = 0
    black_losses = 0
    white_moves = 0
    black_moves = 0

    consecutive_scores = zip(scores_original, scores_original[1:])
    for i, (previous_score, current_score) in enumerate(consecutive_scores, start=1):
        # Skip any transition that touches a mate score, keeping index parity intact.
        if abs(previous_score) >= mate_cutoff or abs(current_score) >= mate_cutoff:
            continue

        score_diff = current_score - previous_score

        if i % 2 == 1:  # White's move
            white_moves += 1
            if score_diff < 0:  # evaluation dropped: loss for White
                white_losses -= score_diff  # negate to make the loss positive
        else:  # Black's move
            black_moves += 1
            if score_diff > 0:  # evaluation rose: loss for Black
                black_losses += score_diff

    average_white_loss = white_losses / white_moves if white_moves else 0
    average_black_loss = black_losses / black_moves if black_moves else 0

    return average_white_loss, average_black_loss

In [None]:
# The per-position scores are the fifth entry of the single-game stats.
scores = stats[4]
acl_white, acl_black = calculate_acpl(scores)

# Report the loss of each side together with the player type that controlled it.
white_line = f"Average Centipawn Loss for {player1} (White) = {acl_white}"
black_line = f"Average Centipawn Loss for {player2} (Black) = {acl_black}"
print(white_line, black_line, sep="\n")

## Play many games

In [None]:
from IPython.display import clear_output

# Simulate n games and gather the statistics of each one.
n = 100
stats = []  # one list of game statistics per finished game

# --- Configuration shared by all games ---
notation = "xLANplus"  # one of "xLANplus", "xLAN", "xLANchk" or "xLANcap"
player1 = "model"  # white side: "player", "model", or "engine"
player2 = "model"  # black side: "player", "model", or "engine"

# Controls the randomness of the model; choose a value between 0.1 and 2.0.
temperature = 0.1
max_model_tries = 15
mate_score = 100_000

# True shows the game history and keeps the board for every move (takes up a lot of space).
# Note: the constructor call below passes show_game_history=False explicitly.
show_game_history = False

for i in range(n):
    # A fresh game object per iteration; output is switched off for the batch run.
    game = ChessGame(
        player1_type=player1,
        model_p1=model,
        player2_type=player2,
        model_p2=model,
        notation=notation,
        xlanplus=True,
        starting_sequence="",
        temperature=temperature,
        max_model_tries=max_model_tries,
        mate_score=mate_score,
        manual_input=False,
        show_output=False,
        show_game_history=False,
    )
    game.play_game()

    stats.append(list(game.get_stats()))

    # Replace the previous progress line instead of stacking one line per game.
    clear_output(wait=True)
    print(f"Game {i+1}/{n} finished ({game.outcome})")

## Visualize Results

In [None]:
# Directory that receives all exported tables and figures.
base_path = "results/survival_rate/survival_rate_temp01"

# A side played by the model is labelled with the model version; any other side as "stockfish".
player1_name = version if player1 == "model" else "stockfish"
player2_name = version if player2 == "model" else "stockfish"

# Common file-name stem shared by every export of this run.
name = f"{player1_name}_vs_{player2_name}_{n}_games"
print(f"{base_path}/{name}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import os

# Make sure the output directory exists before any file is written into it.
os.makedirs(base_path, exist_ok=True)

results_csv = pd.DataFrame(
    stats,
    columns=["Move History", "Number of Plies", "Outcome", "Result", "Player Scores"],
)

# Calculate ACPL for each game (the scores are the fifth entry of each stats row).
acl_white_list = []
acl_black_list = []

for game_stats in stats:
    scores = game_stats[4]
    acl_white, acl_black = calculate_acpl(scores, mate_score=mate_score)
    acl_white_list.append(acl_white)
    acl_black_list.append(acl_black)

# Assign the columns positionally. Looking rows up with stats.index(game_stats)
# returns the first equal row, so games with identical stats would leave the
# later duplicates without a value (and the lookup is quadratic).
results_csv["ACPL White"] = acl_white_list
results_csv["ACPL Black"] = acl_black_list

export_path_csv = f"{base_path}/{name}_stats.csv"
results_csv.to_csv(export_path_csv, index=False)

# Use the number of games actually collected, which may differ from n if the
# simulation loop was interrupted.
num_games = len(stats)

# Calculate average ACPL
average_acl_white = sum(acl_white_list) / num_games
average_acl_black = sum(acl_black_list) / num_games

# Variance (population variance: divided by the number of games)
variance_acl_white = (
    sum((acl - average_acl_white) ** 2 for acl in acl_white_list) / num_games
)
variance_acl_black = (
    sum((acl - average_acl_black) ** 2 for acl in acl_black_list) / num_games
)

# Standard deviation
std_dev_acl_white = variance_acl_white**0.5
std_dev_acl_black = variance_acl_black**0.5

# Max and min ACPL
max_acl_white = max(acl_white_list)
max_acl_black = max(acl_black_list)

min_acl_white = min(acl_white_list)
min_acl_black = min(acl_black_list)

# Summary table: one row per statistic, one column per side.
df_stats = pd.DataFrame(
    {
        "Statistic": [
            "Average Centipawn Loss",
            "Variance of Centipawn Loss",
            "Standard Deviation of Centipawn Loss",
            "Maximum Centipawn Loss",
            "Minimum Centipawn Loss",
        ],
        f"White ({player1_name})": [
            average_acl_white,
            variance_acl_white,
            std_dev_acl_white,
            max_acl_white,
            min_acl_white,
        ],
        f"Black ({player2_name})": [
            average_acl_black,
            variance_acl_black,
            std_dev_acl_black,
            max_acl_black,
            min_acl_black,
        ],
    }
)

# Exporting the table to a CSV file
export_path_table = f"{base_path}/{name}_acpl_table.csv"
df_stats.to_csv(export_path_table, index=False)

# Displaying the table
display(df_stats)

# Plot histogram of the per-game ACPL values of both sides
plt.hist(acl_white_list, bins=20, color="blue", alpha=0.7, label="White")
plt.hist(acl_black_list, bins=20, color="red", alpha=0.7, label="Black")
plt.xlabel("Average Centipawn Loss")
plt.ylabel("Frequency")
plt.title("Histogram of Average Centipawn Loss")
plt.legend()
export_path_hist = f"{base_path}/{name}_histogram.png"
plt.tight_layout()
plt.savefig(export_path_hist)
plt.show()

In [None]:
# create bar plot of results (only showing categories in which there are games)

import matplotlib.pyplot as plt
import numpy as np

# Outcome label of every simulated game (third entry of each stats row).
outcomes = [s[2] for s in stats]

# Distinct outcomes that actually occurred, with the number of games for each.
unique, counts = np.unique(outcomes, return_counts=True)

plt.bar(unique, counts)
plt.ylabel("Number of games")
plt.xlabel("Outcome")
# One xticks call sets both the tick positions and the label rotation; the
# earlier separate call without rotation was redundant.
plt.xticks(unique, rotation=45)
plt.title("Outcome of games")
plt.tight_layout()
plt.savefig(f"{base_path}/{name}_outcome.png")
plt.show()

print("Number of games won by checkmate:", outcomes.count("checkmate"))
print("Number of games won by invalid_move:", outcomes.count("invalid_move"))

In [None]:
# create bar plot of results (showing all categories)
import matplotlib.pyplot as plt
import numpy as np

# Outcome label of every simulated game (third entry of each stats row).
outcomes = [game_stats[2] for game_stats in stats]

# Every category listed here gets a bar, even when no game ended that way.
unique_outcomes = [
    "invalid_move",
    "checkmate",
    "stalemate",
    "insufficient_material",
    "75_move_rule",
    "fivefold_repetition",
    "unknown",
]

# Number of games per listed category, in the order given above.
outcomes_dict = {}
for outcome in unique_outcomes:
    outcomes_dict[outcome] = outcomes.count(outcome)

fig, ax = plt.subplots()
ax.bar(outcomes_dict.keys(), outcomes_dict.values())
ax.set_title("Outcome of Games")
ax.set_ylabel("Number of games")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(f"{base_path}/{name}_outcome_all.png")
plt.show()

# Report the count of each category as text as well.
for outcome, count in outcomes_dict.items():
    print(f"Number of games won by {outcome}: {count}")

In [None]:
# Bar plot of game lengths: for each ply count, how many games ended after that many plies.
number_of_plies_until_end_of_game = [game_stats[1] for game_stats in stats]
unique, counts = np.unique(number_of_plies_until_end_of_game, return_counts=True)

plt.bar(unique, counts)
plt.title("Length of games")
plt.xlabel("Number of moves until end of game")
plt.ylabel("Number of games")
plt.savefig(f"{base_path}/{name}_length.png")
plt.show()

# Mean game length over the n configured games.
total_plies = sum(number_of_plies_until_end_of_game)
average_number_of_moves = total_plies / n
print(f"Average number of moves per game: {average_number_of_moves}")

In [None]:
# Bar plot of the final results.
# Every possible result string gets a bar, even when it never occurred.

# Result string of every simulated game (fourth entry of each stats row).
results = [game_stats[3] for game_stats in stats]

unique = ["0-1", "1-0", "1/2-1/2", "*"]
counts = list(map(results.count, unique))

plt.bar(unique, counts)
plt.title("Game Results")
plt.xlabel("Winner")
plt.ylabel("Number of games")
plt.savefig(f"{base_path}/{name}_results.png")
plt.show()

# Report the count of each result string as text as well.
for i, count in enumerate(counts):
    print(f"Number of games won by {unique[i]}: {count}")

## Log Results to W&B

In [None]:
# import wandb

# # Guide for Resuming Runs: https://docs.wandb.ai/guides/runs/resuming

# PROJECT_NAME = "Leon LLM"
# RUN_ID = "ee63i95h" # The `run_id` of the model can be found in the 'Overview' panel of a run under 'Run Path'.

# # Initialize W&B run
# wandb.init(project=PROJECT_NAME, id=RUN_ID, resume="must")

In [None]:
# ## package the stats into a dictionary and check for correctness

# # TODO: define the stats_dict dictionary
# stats_dict = {
#     "average_acl_white": average_acl_white,
#     "average_acl_black": average_acl_black,
#     "variance_acl_white": variance_acl_white,
#     "variance_acl_black": variance_acl_black,
#     "std_dev_acl_white": std_dev_acl_white,
#     "std_dev_acl_black": std_dev_acl_black,
#     "max_acl_white": max_acl_white,
#     "max_acl_black": max_acl_black,
#     "min_acl_white": min_acl_white,
#     "min_acl_black": min_acl_black,
#     "number_of_games": n,
#     "stats": stats,
# }
# stats_dict

In [None]:
# Logging new evaluation metrics
# wandb.log(stats_dict)

In [None]:
# wandb.finish()

In [None]:
# Imported in this cell because `os` is not imported anywhere else in the notebook.
import os

import pandas as pd

# Directories that hold the exported results of each model version.
paths = [
    "./results/survival_rate/survival_rate_temp01/V63",
    "./results/survival_rate/survival_rate_temp01/V64",
    "./results/survival_rate/survival_rate_temp01/V65",
    "./results/survival_rate/survival_rate_temp01/V66",
    "./results/survival_rate/survival_rate_temp01/V67",
    "./results/survival_rate/survival_rate_temp01/V68",
    "./results/survival_rate/survival_rate_temp01/V69",
]

# Recursively collect every file whose name ends in "games_stats.csv".
csv_files = []
for path in paths:
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith("games_stats.csv"):
                csv_files.append(os.path.join(root, file))

print(csv_files)

# Each of these files is a CSV containing a *column* called "Outcome";
# report the share of every outcome per file.
for csv_file in csv_files:
    # Only process paths that contain the letter "V" exactly twice; skip all others.
    if csv_file.count("V") != 2:
        continue

    # Title = first three "_"-separated tokens of the sixth "/"-separated path
    # component (the file name when the file sits directly in one of `paths`).
    title = "_".join(csv_file.split("/")[5].split("_")[:3])
    print(f"title = {title}")

    with open(csv_file, "r") as f:
        df = pd.read_csv(f)

    # Count how often every distinct outcome appears and convert to percentages.
    outcomes = df["Outcome"]
    counts = outcomes.value_counts()
    percentages = counts / counts.sum() * 100
    print(f"percentages = {percentages}\n")