In [2]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append("..")

from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification
from utils.utils import load_sql_to_df, save_to_sql
import utils.utils as utils
import pytorch_lightning as pl
from models.lightning import LitHuggingfaceClassifier
import pandas as pd
import chess
import random


In [3]:
# Database file with the moves, comments and NAGs; this notebook both reads
# from it and writes cleaned tables back into it.
db_file = "../../data/chess_moves_comments_nags.db"
# Database file that receives the assembled move-quality tables.
move_quality_db_file = "../../data/chess_moves_quality.db"

In [4]:
# Load the complete `all_moves` table from db_file.
all_moves = load_sql_to_df("SELECT * from all_moves", db_file)
# NOTE(review): dropping the "index" column is disabled for this frame, while
# moves_from_books does drop it — confirm whether all_moves still carries it.
# all_moves.drop("index", axis=1, inplace=True)

In [5]:
# Columns shared by every move table that goes into the combined frame.
move_columns = ["fen", "move", "sentiment"]
# Source tables in db_file that are read and narrowed to those columns.
tables = [
    "nags_without_comments",
    "labeled_moves_with_comments",
    "predicted_moves_with_comments",
]

# One frame per source table, restricted to the shared columns.
dfs = [
    load_sql_to_df(f"SELECT * from {table_name}", db_file).loc[:, move_columns]
    for table_name in tables
]

In [32]:
# Read the book moves and discard the stored "index" column in one chained
# step instead of mutating the frame in place.
moves_from_books = load_sql_to_df(
    "SELECT * from moves_from_books", "../../data/chess.db"
).drop(columns="index")

In [34]:
# Rename "position" to "fen", the column name used by the other move tables.
moves_from_books = moves_from_books.rename(columns={"position": "fen"})

In [38]:
# Write moves_from_books to the `moves_from_books` table of the move-quality database.
# NOTE(review): unlike the other save_to_sql calls in this notebook, no
# if_exists is passed here; what happens when the table already exists depends
# on save_to_sql's default — confirm before re-running.
save_to_sql(moves_from_books, move_quality_db_file, "moves_from_books")

In [5]:
# Stack the per-table frames from `dfs` into a single frame; ignore_index=True
# gives the result a fresh 0..n-1 index.
move_quality = pd.concat(dfs, ignore_index=True)

In [22]:
# Store the stacked comment/NAG moves as table `comments_and_nags`,
# passing if_exists="replace".
save_to_sql(
    move_quality,
    move_quality_db_file,
    "comments_and_nags",
    if_exists="replace",
)

In [11]:
def random_move(board: "chess.Board | str", rng: "random.Random | None" = None) -> str:
    """Pick a random legal move for a position and return it in UCI notation.

    Args:
        board: A ``chess.Board``, or a FEN string that is parsed into one.
        rng: Optional generator exposing ``choice(seq)`` (for example a seeded
            ``random.Random``) so sampling can be made reproducible. When
            omitted, the module-level ``random`` generator is used, which is
            the original behaviour.

    Returns:
        The selected move as a UCI string.

    Raises:
        IndexError: If the position has no legal moves. This is the same
            exception type ``random.choice`` raised before, now with a message
            that names the offending position.
    """
    if isinstance(board, str):
        board = chess.Board(board)

    legal_moves = list(board.legal_moves)
    if not legal_moves:
        # Finished games (mate / stalemate) have nothing to sample from.
        raise IndexError(f"no legal moves available in position: {board.fen()}")

    chooser = random if rng is None else rng
    return chooser.choice(legal_moves).uci()

def random_move_tuple(fen):
    """Return a ``(fen, move, 0)`` triple for a randomly chosen legal move.

    The move comes from ``random_move(fen)``; the last element is always the
    constant ``0`` (presumably the sentiment label given to random moves —
    confirm against how the tuples are consumed).
    """
    return (fen, random_move(fen), 0)


def is_ok(row):
    """Report whether ``row.move`` can be played in the position ``row.fen``.

    Args:
        row: Any object exposing ``fen`` and ``move`` attributes (e.g. a
            DataFrame row); ``fen`` is handed to ``chess.Board`` and ``move``
            to ``Board.push_uci``.

    Returns:
        True when the board is built and the move is pushed without error,
        False when either step raises an ordinary exception.
    """
    try:
        board = chess.Board(row.fen)
        board.push_uci(row.move)
        return True
    except Exception:
        # Stay best-effort for bad data of any kind, but do not swallow
        # KeyboardInterrupt / SystemExit the way a bare ``except:`` does —
        # otherwise a long row-wise validation run cannot be interrupted.
        return False

In [None]:
# NOTE: `sentimate_df` is built in a later cell of this notebook, after
# `random_moves` has been loaded. Building it at this point raised NameError on
# a fresh kernel because `random_moves` is not defined yet, so the premature
# duplicate of that concat was removed from this cell.

In [13]:
# Load the `random_moves` table from the move-quality database.
random_moves = load_sql_to_df("SELECT * from random_moves", move_quality_db_file)
# Load the `moves_with_comments` table from the comments/NAGs database.
commented_moves = load_sql_to_df("SELECT * from moves_with_comments", db_file)

Unnamed: 0,fen,move,comment,nags,sentiment,color_comment
0,r1bq1k1r/1pp3p1/p4pn1/2bPp2p/B3P1nP/2N3PN/PP1B...,c5d6,,[1],1,black [SEP] 
1,r2qkb1r/1p1b1ppp/p1nppn2/6B1/B3P3/2PQ1N2/PP3PP...,f8e7,N,[5],-1,black [SEP] N
2,6k1/5p2/4p1p1/p5N1/1p1q3p/1P5P/P5P1/5QK1 w - -...,g1h1,),"[1, 18]",1,white [SEP] )
3,r1b1k2r/ppp1nppp/5q2/2bpn3/3NP3/2P1B3/PP2BPPP/...,e8g8,N,[5],-1,black [SEP] N
4,3qr1k1/1br1bpp1/p4n1p/1p1pNR2/3P3B/P1N1P3/1P2Q...,b7c8,#,[],-1,black [SEP] #
...,...,...,...,...,...,...
5111775,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1,white [SEP] Este es la Estructura Lazo de Marï...
5111776,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1,white [SEP] Este es la Estructura Lazo de Marï...
5111777,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1,white [SEP] Este es la Estructura Lazo de Marï...
5111778,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1,white [SEP] Este es la Estructura Lazo de Marï...


In [41]:
# Combine the comment/NAG moves, the book moves and the random moves into one
# frame with a fresh 0..n-1 index. All three inputs must already be defined in
# the kernel when this cell runs.
sentimate_df = pd.concat([move_quality, moves_from_books, random_moves], ignore_index=True)

In [43]:
# Save the combined frame as table `sentimate_v1` in the move-quality database,
# passing if_exists="replace".
save_to_sql(sentimate_df, move_quality_db_file, "sentimate_v1", if_exists="replace")

In [45]:
# Show only the rows whose sentiment is 1.
sentimate_df.loc[sentimate_df["sentiment"] == 1]

Unnamed: 0,fen,move,sentiment
0,b2r3r/4Rp1p/p4np1/1p1q4/kP6/P1Q2PPB/2P4P/1K6 w...,e7a7,1
1,7r/3R1p1p/6p1/1p6/2q5/5PP1/1Q1r3P/1K1k1B2 b - ...,d2d7,1
2,r3r3/3n1p1p/pR4p1/1p1q4/kP6/P1Q2PP1/2P4P/1K3B2...,b6d6,1
4,b2r3r/R4p1p/p2q2p1/1p1n4/kP1Q4/P4PPB/2P4P/1K6 ...,a7a6,1
5,3r3r/4Rp1p/pk1qBQp1/Np1b4/8/P4PP1/1PP4P/1K6 w ...,b2b4,1
...,...,...,...
7351875,8/1kp2Q2/1pb5/p1P5/4r3/1PP5/3K4/8 b - - 0 56,b6b5,1
7351876,8/1kp1PQ2/1pb5/p7/2P5/1P6/1KP1rp2/8 b - - 2 47,f2f1q,1
7351877,8/1kp1PQ2/1pb5/p7/2P5/1P6/1KP1r3/5q2 w - - 0 48,f7f1,1
7351878,8/1kp1P3/1pb5/p7/2P5/1P6/1KP1r3/5Q2 b - - 0 48,e2e7,1


In [46]:
# Convert the combined pandas frame into a `Dataset` object.
sentimate_dataset = Dataset.from_pandas(sentimate_df)

In [16]:
# Rows with sentiment -1 are kept as the "unlabeled" subset; display it.
is_unlabeled = commented_moves["sentiment"] == -1
unlabeled_moves_with_comments = commented_moves.loc[is_unlabeled]
unlabeled_moves_with_comments

Unnamed: 0,fen,move,comment,nags,sentiment,color_comment
1,r2qkb1r/1p1b1ppp/p1nppn2/6B1/B3P3/2PQ1N2/PP3PP...,f8e7,N,[5],-1,black [SEP] N
3,r1b1k2r/ppp1nppp/5q2/2bpn3/3NP3/2P1B3/PP2BPPP/...,e8g8,N,[5],-1,black [SEP] N
4,3qr1k1/1br1bpp1/p4n1p/1p1pNR2/3P3B/P1N1P3/1P2Q...,b7c8,#,[],-1,black [SEP] #
7,r1b1rb1k/pppp1ppB/2n2q1p/4p3/2P5/P3PN2/1PQP1PP...,h7e4,,[14],-1,white [SEP] 
8,rnb4r/pp1pk1bp/1qpN1pp1/8/3P4/5N2/PPP2PPP/1R1Q...,d6c4,@,[16],-1,white [SEP] @
...,...,...,...,...,...,...
5111774,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1,white [SEP] Este es la Estructura Lazo de Marï...
5111775,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1,white [SEP] Este es la Estructura Lazo de Marï...
5111776,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1,white [SEP] Este es la Estructura Lazo de Marï...
5111777,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1,white [SEP] Este es la Estructura Lazo de Marï...


In [15]:
# Save the unlabeled subset as table `unlabeled_moves_with_comments` in db_file,
# passing if_exists="replace".
save_to_sql(unlabeled_moves_with_comments, db_file, "unlabeled_moves_with_comments", if_exists="replace")

In [49]:
# Inspect the current state of commented_moves.
commented_moves

Unnamed: 0,fen,move,comment,nags,sentiment
1859254,r1bq1k1r/1pp3p1/p4pn1/2bPp2p/B3P1nP/2N3PN/PP1B...,c5d6,,[1],1
1859255,r2qkb1r/1p1b1ppp/p1nppn2/6B1/B3P3/2PQ1N2/PP3PP...,f8e7,N,[5],-1
1859256,6k1/5p2/4p1p1/p5N1/1p1q3p/1P5P/P5P1/5QK1 w - -...,g1h1,),"[1, 18]",1
1859257,r1b1k2r/ppp1nppp/5q2/2bpn3/3NP3/2P1B3/PP2BPPP/...,e8g8,N,[5],-1
1859258,3qr1k1/1br1bpp1/p4n1p/1p1pNR2/3P3B/P1N1P3/1P2Q...,b7c8,#,[],-1
...,...,...,...,...,...
9059210,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1
9059211,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1
9059212,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1
9059213,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1


In [53]:
# Inspect the current state of commented_moves.
commented_moves

Unnamed: 0,fen,move,comment,nags,sentiment
1859254,r1bq1k1r/1pp3p1/p4pn1/2bPp2p/B3P1nP/2N3PN/PP1B...,c5d6,,[1],1
1859255,r2qkb1r/1p1b1ppp/p1nppn2/6B1/B3P3/2PQ1N2/PP3PP...,f8e7,N,[5],-1
1859256,6k1/5p2/4p1p1/p5N1/1p1q3p/1P5P/P5P1/5QK1 w - -...,g1h1,),"[1, 18]",1
1859257,r1b1k2r/ppp1nppp/5q2/2bpn3/3NP3/2P1B3/PP2BPPP/...,e8g8,N,[5],-1
1859258,3qr1k1/1br1bpp1/p4n1p/1p1pNR2/3P3B/P1N1P3/1P2Q...,b7c8,#,[],-1
...,...,...,...,...,...
9059210,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1
9059211,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1
9059212,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1
9059213,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1


In [None]:
# Show the rows whose whole comment is made up of bracketed "[%name ...]"
# commands (eval, cal, clk, timestamp, mdl, csl, emt) and/or bare numbers,
# each optionally followed by whitespace.
commented_moves[commented_moves.comment.str.match(r"^(([[][%](eval|cal|clk|timestamp|mdl|csl|emt)[^]]+[]]\s*)|(\d+\s*))+$")]

In [47]:
# Number of rows currently in commented_moves.
len(commented_moves)

5142505

In [9]:
# Write the current commented_moves frame to table `moves_with_comments` in
# db_file with if_exists="replace". This is the same table the frame is loaded
# from elsewhere in this notebook, so the stored source data is overwritten.
save_to_sql(commented_moves, db_file, "moves_with_comments", if_exists="replace")

In [61]:
# Inspect the current state of commented_moves.
commented_moves

Unnamed: 0,fen,move,comment,nags,sentiment
1859254,r1bq1k1r/1pp3p1/p4pn1/2bPp2p/B3P1nP/2N3PN/PP1B...,c5d6,,[1],1
1859255,r2qkb1r/1p1b1ppp/p1nppn2/6B1/B3P3/2PQ1N2/PP3PP...,f8e7,N,[5],-1
1859256,6k1/5p2/4p1p1/p5N1/1p1q3p/1P5P/P5P1/5QK1 w - -...,g1h1,),"[1, 18]",1
1859257,r1b1k2r/ppp1nppp/5q2/2bpn3/3NP3/2P1B3/PP2BPPP/...,e8g8,N,[5],-1
1859258,3qr1k1/1br1bpp1/p4n1p/1p1pNR2/3P3B/P1N1P3/1P2Q...,b7c8,#,[],-1
...,...,...,...,...,...
9059210,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1
9059211,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1
9059212,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1
9059213,r1bqkbnr/pp1ppp1p/2n3p1/8/3NP3/8/PPP2PPP/RNBQK...,c2c4,Este es la Estructura Lazo de Marï¿½czy -o ten...,[],-1


In [None]:
# Label moves from the leading punctuation of their comment.
# Comments that start with one or more "?" get sentiment 0.
all_moves.loc[all_moves.comment.str.match("^[?]+"), "sentiment"] = 0
# Comments that start with one or more "!" followed by a space get sentiment 1.
# NOTE(review): the "!" pattern requires a trailing space while the "?" pattern
# does not — confirm the asymmetry is intended.
all_moves.loc[all_moves.comment.str.match("^[!]+[ ]"), "sentiment"] = 1

In [60]:
# Comments that consist solely of an annotation glyph are labelled directly:
# "!" / "!!" -> sentiment 1, "?" / "??" -> sentiment 0.
commented_moves.loc[commented_moves.comment.isin(["!!", "!"]), "sentiment"] = 1
commented_moves.loc[commented_moves.comment.isin(["??", "?"]), "sentiment"] = 0

# Drop comments that are nothing but "[%eval ...]" tags (optionally space-separated).
commented_moves = commented_moves[~commented_moves.comment.str.match("^([[][%]eval[^]]+[]][ ]*)+$")]
# Drop short comments (fewer than 16 characters) that contain "diagram" in any
# letter case before the first line break. The leading ".*" also covers
# comments that start with the word, so no separate start-anchored pass is needed.
commented_moves = commented_moves[~((commented_moves.comment.str.match(".*[dD][iI][aA][gG][rR][aA][mM].*")) & (commented_moves.comment.str.len() < 16))]
# Drop comments built only from bracketed %-commands or "diagram"/"pgndiagram" markers.
# NOTE(review): the trailing \s* binds only to the diagram alternative, so
# whitespace between two bracketed commands prevents a match — confirm intended.
commented_moves = commented_moves[~commented_moves.comment.str.match(r"^(([[][%](eval|cal|clk|timestamp|mdl|csl|emt)[^]]+[]])|((pgn)?[Dd]iagram)\s*)+$")]
# Drop numeric "<score>/<depth> <time>" style comments such as "+0.35/12 1.5s"
# (presumably engine output — confirm). The pattern is a raw string so "\d" is
# not parsed as an (invalid) string escape sequence.
commented_moves = commented_moves[~commented_moves.comment.str.match(r"[+-]?M?\d+([.]\d+)?/(\d+)? (\d+([.]\d+s)*)*$")]
# Same family with whole-second timings, e.g. "+0.35/12 3s".
commented_moves = commented_moves[~commented_moves.comment.str.match(r"^[+-]?\d+([.]\d+)?/\d+\s+\d+s$")]

# Derive the colour-prefixed comment text with the project helper (values look
# like "black [SEP] N"). `.assign` returns a new frame, which avoids writing a
# column into the filtered slice produced above.
color_comment = commented_moves.apply(utils.add_color_to_comment, axis=1)
commented_moves = commented_moves.assign(color_comment=color_comment)
# Drop short comments (fewer than 16 characters) containing "><" before the first line break.
commented_moves = commented_moves[~((commented_moves.comment.str.match(r".*><.*")) & (commented_moves.comment.str.len() < 16))]

In [105]:
# Build the train/validation/test splits from the cleaned commented moves.
# NOTE(review): presumably class-balanced with 200,000 training rows and 5,000
# validation/test rows per class — inferred from the helper and argument names;
# confirm in utils.
datasets = utils.train_val_test_balanced_datasets(commented_moves, train_size_per_class=200_000, val_test_size_per_class=5_000)

In [None]:
# Persist the splits to the datasets folder on disk.
datasets.save_to_disk("../../data/datasets/comments_with_color_200k_5k/")