In [None]:
#load the packages
import numpy as np
import sklearn.model_selection
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import pandas as pd
import json
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.optim as optim
import torchvision.transforms as transforms
from collections import Counter
from pathlib import Path
from io import BytesIO
from torch.utils.data import Dataset, DataLoader

In [14]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
import json
from tqdm import tqdm

# ============================================================
#                 CONFIGURATION
# ============================================================
CSV_PATH = "all_games.csv"  # Your dataset

LIST_OF_PLAYERS = [
    "Carlsen, Magnus",
    "Firouzja, Alireza",
    "Caruana, Fabiano",
    "Nepomniachtchi, Ian",
    "Cramling Bellon, Anna",
    "Giri, Anish",
    "Niemann, Hans Moke",
    "Cramling, Pia",
    "Nakamura, Hikaru",
    "Botez, Alexandra",
    "Botez, Andrea",
    "Belenkaya, Dina",
    "So, Wesley",
]

# ---- Only use FIRST 4 players ----
PLAYERS = LIST_OF_PLAYERS[:4]

MAX_LEN = 60
BATCH_SIZE = 256
EMB_DIM = 128
HIDDEN_DIM = 128
EPOCHS = 30
LR = 2e-3

# ============================================================
#                 LOAD CSV
# ============================================================
print("Loading CSV...")
df = pd.read_csv(CSV_PATH)

df["white_name"] = df["white_name"].astype(str)
df["black_name"] = df["black_name"].astype(str)

# ============================================================
#                 LABEL CONSTRUCTION (5 players only)
# ============================================================
def extract_player(name):
    for p in PLAYERS:
        if p.lower() in name.lower():
            return p
    return None

df["label"] = df.apply(
    lambda row: extract_player(row["white_name"]) or extract_player(row["black_name"]),
    axis=1,
)

# Keep ONLY games involving the first 5 players
df = df[df["label"].notna()].reset_index(drop=True)

print("Games left after filtering:", len(df))
print(df["label"].value_counts())

# ============================================================
#                 MOVE TOKENIZATION
# ============================================================
print("Building vocabulary...")

all_moves = []
for moves in df["list_of_moves"]:
    moves = moves.strip("[]").replace("'", "").replace(",", "")
    tokens = moves.split()
    all_moves.extend(tokens)

move_counts = Counter(all_moves)

vocab = {"<PAD>": 0, "<UNK>": 1}
for move in move_counts:
    vocab[move] = len(vocab)

with open("vocab.json", "w") as f:
    json.dump(vocab, f)

# ============================================================
#                 DATASET CLASS
# ============================================================
label2id = {name: i for i, name in enumerate(PLAYERS)}
id2label = {i: name for name, i in label2id.items()}

def encode_moves(moves):
    tokens = moves.strip("[]").replace("'", "").replace(",", "").split()
    idxs = [vocab.get(t, 1) for t in tokens][:MAX_LEN]
    while len(idxs) < MAX_LEN:
        idxs.append(0)
    return idxs

class ChessDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        moves = encode_moves(row["list_of_moves"])
        label = label2id[row["label"]]
        return torch.tensor(moves, dtype=torch.long), torch.tensor(label, dtype=torch.long)

# ============================================================
#                 TRAIN/VAL/TEST SPLIT
# ============================================================
train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42, shuffle=True)
val_df, test_df = train_test_split(temp_df, test_size=0.50, random_state=42, shuffle=True)

train_loader = DataLoader(ChessDataset(train_df), batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(ChessDataset(val_df),   batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(ChessDataset(test_df),  batch_size=BATCH_SIZE, shuffle=False)

# ============================================================
#                 MODEL
# ============================================================
class ChessRNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.rnn = nn.GRU(emb_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        _, h = self.rnn(x)
        h = self.dropout(h.squeeze(0))
        return self.fc(h)

model = ChessRNN(len(vocab), EMB_DIM, HIDDEN_DIM, len(label2id))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# ============================================================
#                 TRAINING LOOP
# ============================================================
print("Training model...")

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for X, y in tqdm(train_loader, colour='green'):
        optimizer.zero_grad()
        logits = model(X)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == y).sum().item()
        total += len(y)

    train_acc = correct / total

    # Validation
    model.eval()
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for Xv, yv in val_loader:
            preds = model(Xv).argmax(dim=1)
            val_correct += (preds == yv).sum().item()
            val_total += len(yv)

    val_acc = val_correct / val_total

    print(
        f"Epoch {epoch+1}/{EPOCHS} "
        f"Loss = {total_loss:.3f}  "
        f"Train Acc = {train_acc:.3f}  "
        f"Val Acc = {val_acc:.3f}"
    )

# ============================================================
#                 FINAL TEST ACCURACY
# ============================================================
print("Evaluating test accuracy...")

model.eval()
test_correct = 0
test_total = 0

with torch.no_grad():
    for Xt, yt in test_loader:
        preds = model(Xt).argmax(dim=1)
        test_correct += (preds == yt).sum().item()
        test_total += len(yt)

print(f"FINAL TEST ACCURACY = {test_correct / test_total:.3f}")

# ============================================================
#                 SAVE MODEL
# ============================================================
torch.save(model.state_dict(), "rnn_player_classifier.pt")
print("Model saved as rnn_player_classifier.pt")


Loading CSV...
Games left after filtering: 24348
label
Carlsen, Magnus        7726
Caruana, Fabiano       6418
Nepomniachtchi, Ian    5157
Firouzja, Alireza      5047
Name: count, dtype: int64
Building vocabulary...
Training model...


100%|[32m██████████[0m| 67/67 [00:12<00:00,  5.56it/s]


Epoch 1/30 Loss = 92.551  Train Acc = 0.300  Val Acc = 0.315


100%|[32m██████████[0m| 67/67 [00:11<00:00,  5.72it/s]


Epoch 2/30 Loss = 90.312  Train Acc = 0.349  Val Acc = 0.320


100%|[32m██████████[0m| 67/67 [00:11<00:00,  5.80it/s]


Epoch 3/30 Loss = 88.059  Train Acc = 0.379  Val Acc = 0.310


100%|[32m██████████[0m| 67/67 [00:14<00:00,  4.58it/s]


Epoch 4/30 Loss = 83.969  Train Acc = 0.431  Val Acc = 0.325


100%|[32m██████████[0m| 67/67 [00:15<00:00,  4.40it/s]


Epoch 5/30 Loss = 77.429  Train Acc = 0.494  Val Acc = 0.334


100%|[32m██████████[0m| 67/67 [00:12<00:00,  5.37it/s]


Epoch 6/30 Loss = 67.648  Train Acc = 0.580  Val Acc = 0.356


100%|[32m██████████[0m| 67/67 [00:13<00:00,  5.05it/s]


Epoch 7/30 Loss = 54.240  Train Acc = 0.680  Val Acc = 0.346


100%|[32m██████████[0m| 67/67 [00:14<00:00,  4.58it/s]


Epoch 8/30 Loss = 41.557  Train Acc = 0.762  Val Acc = 0.364


100%|[32m██████████[0m| 67/67 [00:13<00:00,  4.96it/s]


Epoch 9/30 Loss = 29.298  Train Acc = 0.840  Val Acc = 0.358


100%|[32m██████████[0m| 67/67 [00:33<00:00,  2.03it/s]


Epoch 10/30 Loss = 20.574  Train Acc = 0.891  Val Acc = 0.379


100%|[32m██████████[0m| 67/67 [00:17<00:00,  3.87it/s]


Epoch 11/30 Loss = 15.194  Train Acc = 0.921  Val Acc = 0.378


100%|[32m██████████[0m| 67/67 [00:17<00:00,  3.93it/s]


Epoch 12/30 Loss = 11.097  Train Acc = 0.944  Val Acc = 0.379


100%|[32m██████████[0m| 67/67 [00:16<00:00,  4.01it/s]


Epoch 13/30 Loss = 9.214  Train Acc = 0.953  Val Acc = 0.383


100%|[32m██████████[0m| 67/67 [00:23<00:00,  2.81it/s]


Epoch 14/30 Loss = 8.237  Train Acc = 0.959  Val Acc = 0.381


100%|[32m██████████[0m| 67/67 [00:27<00:00,  2.41it/s]


Epoch 15/30 Loss = 7.001  Train Acc = 0.965  Val Acc = 0.377


100%|[32m██████████[0m| 67/67 [00:20<00:00,  3.24it/s]


Epoch 16/30 Loss = 5.812  Train Acc = 0.972  Val Acc = 0.385


100%|[32m██████████[0m| 67/67 [00:17<00:00,  3.81it/s]


Epoch 17/30 Loss = 5.295  Train Acc = 0.976  Val Acc = 0.375


100%|[32m██████████[0m| 67/67 [00:19<00:00,  3.49it/s]


Epoch 18/30 Loss = 5.290  Train Acc = 0.976  Val Acc = 0.378


100%|[32m██████████[0m| 67/67 [00:19<00:00,  3.42it/s]


Epoch 19/30 Loss = 5.162  Train Acc = 0.976  Val Acc = 0.374


100%|[32m██████████[0m| 67/67 [00:19<00:00,  3.41it/s]


Epoch 20/30 Loss = 5.110  Train Acc = 0.974  Val Acc = 0.378


100%|[32m██████████[0m| 67/67 [00:21<00:00,  3.09it/s]


Epoch 21/30 Loss = 4.931  Train Acc = 0.975  Val Acc = 0.378


100%|[32m██████████[0m| 67/67 [00:24<00:00,  2.69it/s]


Epoch 22/30 Loss = 5.315  Train Acc = 0.974  Val Acc = 0.382


100%|[32m██████████[0m| 67/67 [00:19<00:00,  3.49it/s]


Epoch 23/30 Loss = 4.419  Train Acc = 0.979  Val Acc = 0.374


100%|[32m██████████[0m| 67/67 [00:18<00:00,  3.53it/s]


Epoch 24/30 Loss = 3.617  Train Acc = 0.982  Val Acc = 0.374


100%|[32m██████████[0m| 67/67 [00:39<00:00,  1.70it/s]


Epoch 25/30 Loss = 3.799  Train Acc = 0.983  Val Acc = 0.377


100%|[32m██████████[0m| 67/67 [00:28<00:00,  2.39it/s]


Epoch 26/30 Loss = 4.952  Train Acc = 0.975  Val Acc = 0.367


100%|[32m██████████[0m| 67/67 [00:25<00:00,  2.62it/s]


Epoch 27/30 Loss = 4.818  Train Acc = 0.976  Val Acc = 0.376


100%|[32m██████████[0m| 67/67 [00:24<00:00,  2.77it/s]


Epoch 28/30 Loss = 3.969  Train Acc = 0.981  Val Acc = 0.382


100%|[32m██████████[0m| 67/67 [00:20<00:00,  3.32it/s]


Epoch 29/30 Loss = 4.203  Train Acc = 0.980  Val Acc = 0.382


100%|[32m██████████[0m| 67/67 [00:16<00:00,  4.05it/s]


Epoch 30/30 Loss = 3.235  Train Acc = 0.984  Val Acc = 0.386
Evaluating test accuracy...
FINAL TEST ACCURACY = 0.400
Model saved as rnn_player_classifier.pt
