# Data Reading Notebook

This notebook showcases how the data is read from the PGN file and transformed into a CSV file suitable for being fed into a model or analyzed in later sections.

## Imports

In [1]:
from utils.utils import *
import warnings

In [2]:
warnings.filterwarnings("ignore")

## Data Reading

In [3]:
# Locations of the processed, gzip-compressed CSV splits, relative to this
# notebook. Forward slashes are used throughout: they are accepted on Windows
# as well as on POSIX systems, whereas "\\" separators only work on Windows.
train_data_path = "../Processed_Data/NLP_Train.csv.gz"
val_data_path = "../Processed_Data/NLP_Val.csv.gz"
test_data_path = "../Processed_Data/NLP_Test.csv.gz"
# NOTE: this points at the Windows AVX2 build of Stockfish; use the binary
# matching your platform when running the notebook elsewhere.
stockfish_path = "../stockfish/stockfish-windows-x86-64-avx2.exe"

In [4]:
# Compressed PGN archive used as the source of the training split. Forward
# slashes keep the path valid on both Windows and POSIX systems.
train_pgn_path = "../Data/Train/lichess_db_standard_rated_2013-08.pgn.zst"
# Convert up to 20 games from the archive and write the result to
# train_data_path.
# NOTE(review): shuffle=True is passed without any visible seed — confirm
# whether save_data produces the same file on every run.
save_data(pgn_file_path=train_pgn_path,
          save_file_path=train_data_path, max_num_games=20,
          stockfish_path=stockfish_path, shuffle=True)

Num processed games in a file = 20


In [5]:
# Compressed PGN archive used as the source of the test split. Forward
# slashes keep the path valid on both Windows and POSIX systems.
test_pgn_path = "../Data/Test/lichess_db_standard_rated_2013-02.pgn.zst"
# Convert up to 10 games from the archive and write the result to
# test_data_path.
# NOTE(review): shuffle=True is passed without any visible seed — confirm
# whether save_data produces the same file on every run.
save_data(pgn_file_path=test_pgn_path,
          save_file_path=test_data_path, max_num_games=10,
          stockfish_path=stockfish_path, shuffle=True)

Num processed games in a file = 10


In [6]:
# Compressed PGN archive used as the source of the validation split. Forward
# slashes keep the path valid on both Windows and POSIX systems.
val_pgn_path = "../Data/Val/lichess_db_standard_rated_2013-01.pgn.zst"
# Convert up to 10 games from the archive and write the result to
# val_data_path.
# NOTE(review): shuffle=True is passed without any visible seed — confirm
# whether save_data produces the same file on every run.
save_data(pgn_file_path=val_pgn_path,
          save_file_path=val_data_path, max_num_games=10,
          stockfish_path=stockfish_path, shuffle=True)

Num processed games in a file = 10


In [7]:
# Load the three gzip-compressed CSV splits back into DataFrames, in the
# order train, validation, test.
train_data, val_data, test_data = (
    pd.read_csv(split_path, compression='gzip')
    for split_path in (train_data_path, val_data_path, test_data_path)
)

## Data Showcase

In [8]:
# Preview the first five rows of the training split.
train_data.head()

Unnamed: 0,game_number,move_number,board,move,legal,stockfish_2,stockfish_5,stockfish_10,move_quality_2,move_quality_5,...,en_passant,halfmove_clock,fullmove_number,prev_board,prev_piece_placement,prev_active_color,prev_castling_availability,prev_en_passant,prev_halfmove_clock,prev_fullmove_number
0,15,11,r1bqk1nr/pppp1ppp/2n5/8/1bB1P3/2N2N2/PP3PPP/R1...,f1c4,True,-37.0,-37.0,-29.0,-25.0,-11.0,...,-,2,6,r1bqk1nr/pppp1ppp/2n5/8/1b2P3/2N2N2/PP3PPP/R1B...,r1bqk1nr/pppp1ppp/2n5/8/1b2P3/2N2N2/PP3PPP/R1B...,w,KQkq,-,1,6
1,1,11,r1bqkb1r/pp3ppp/2n1pn2/2pP4/8/P1NP3P/1PP2PP1/R...,e4d5,True,-64.0,-70.0,-62.0,-19.0,-25.0,...,-,0,6,r1bqkb1r/pp3ppp/2n1pn2/2pp4/4P3/P1NP3P/1PP2PP1...,r1bqkb1r/pp3ppp/2n1pn2/2pp4/4P3/P1NP3P/1PP2PP1...,w,KQkq,-,0,6
2,18,13,r1bqk2r/ppp2ppp/1Qn2n2/3p4/1bP5/2N5/PP1BPPPP/R...,d1b6,False,,,,,,...,-,1,7,r1bqk2r/ppp2ppp/2n2n2/3p4/1bP5/2N5/PP1BPPPP/R2...,r1bqk2r/ppp2ppp/2n2n2/3p4/1bP5/2N5/PP1BPPPP/R2...,w,KQkq,-,0,7
3,8,25,r4Pk1/1ppq1pp1/p1n2n1p/3p1p2/3P4/P1N1PN2/1PP2P...,h4f8,False,,,,,,...,-,0,13,r4rk1/1ppq1pp1/p1n2n1p/3p1p2/3P3P/P1N1PN2/1PP2...,r4rk1/1ppq1pp1/p1n2n1p/3p1p2/3P3P/P1N1PN2/1PP2...,w,KQ,-,0,13
4,0,29,1Pk1r2r/p2nq1b1/1ppp1n1p/4p1p1/4P3/P2P1QBP/BPP...,f7b8,False,,,,,,...,-,0,15,2k1r2r/p2nqpb1/1ppp1n1p/4p1p1/4P3/P2P1QBP/BPP2...,2k1r2r/p2nqpb1/1ppp1n1p/4p1p1/4P3/P2P1QBP/BPP2...,w,-,-,2,15


In [9]:
# Preview the first five rows of the validation split.
val_data.head()

Unnamed: 0,game_number,move_number,board,move,legal,stockfish_2,stockfish_5,stockfish_10,move_quality_2,move_quality_5,...,en_passant,halfmove_clock,fullmove_number,prev_board,prev_piece_placement,prev_active_color,prev_castling_availability,prev_en_passant,prev_halfmove_clock,prev_fullmove_number
0,4,37,r4b1r/pp1bnkp1/1q2B3/2ppP2p/5P2/2PP1K1P/PP1N4/...,g4e6,True,-678.0,-747.0,-727.0,-73.0,-156.0,...,-,0,19,r4b1r/pp1bnkp1/1q2p3/2ppP2p/5PB1/2PP1K1P/PP1N4...,r4b1r/pp1bnkp1/1q2p3/2ppP2p/5PB1/2PP1K1P/PP1N4...,w,-,-,0,19
1,5,54,Q3k1r1/2p2p1p/3pp2p/2r3p1/8/6P1/P2R1P2/5RK1 w ...,h2h7,False,,,,,,...,-,0,28,Q3k1r1/2p2p2/3pp2p/2r3p1/8/6P1/P2R1P1P/5RK1 b ...,Q3k1r1/2p2p2/3pp2p/2r3p1/8/6P1/P2R1P1P/5RK1,b,-,-,2,27
2,7,44,1r2k2r/1p2np2/pNp3q1/2Pnp1pp/1P6/P2PQ2P/4BPP1/...,f6d5,True,-235.0,-235.0,-244.0,-59.0,-70.0,...,-,0,23,1r2k2r/1p2np2/pNp2nq1/2PPp1pp/1P6/P2PQ2P/4BPP1...,1r2k2r/1p2np2/pNp2nq1/2PPp1pp/1P6/P2PQ2P/4BPP1...,b,k,-,0,22
3,6,11,r1bq1rk1/pppp1p1p/2P2n2/2b1p3/2B1P3/2PP1N2/PP3...,g7c6,False,,,,,,...,-,0,6,r1bq1rk1/pppp1ppp/2n2n2/2b1p3/2B1P3/2PP1N2/PP3...,r1bq1rk1/pppp1ppp/2n2n2/2b1p3/2B1P3/2PP1N2/PP3...,w,KQ,-,1,6
4,9,76,8/5pk1/p1Q3b1/1p6/1q6/8/5PPK/8 w - - 0 39,a3b4,True,507.0,507.0,504.0,27.0,30.0,...,-,0,39,8/5pk1/p1Q3b1/1p6/1P6/q7/5PPK/8 b - - 0 38,8/5pk1/p1Q3b1/1p6/1P6/q7/5PPK/8,b,-,-,0,38


In [10]:
# Preview the first five rows of the test split.
test_data.head()

Unnamed: 0,game_number,move_number,board,move,legal,stockfish_2,stockfish_5,stockfish_10,move_quality_2,move_quality_5,...,en_passant,halfmove_clock,fullmove_number,prev_board,prev_piece_placement,prev_active_color,prev_castling_availability,prev_en_passant,prev_halfmove_clock,prev_fullmove_number
0,4,11,rnb1k1nr/ppp1ppbp/4q1p1/8/3P4/2N2N2/PPP1BPPP/R...,g1f3,True,117.0,119.0,129.0,-15.0,-14.0,...,-,2,6,rnb1k1nr/ppp1ppbp/4q1p1/8/3P4/2N5/PPP1BPPP/R1B...,rnb1k1nr/ppp1ppbp/4q1p1/8/3P4/2N5/PPP1BPPP/R1B...,w,KQkq,-,1,6
1,1,3,rnbqkbnr/pppp1ppp/8/4p3/4P3/8/PP1P1PPP/RNBQKBN...,c2f2,False,,,,,,...,-,0,2,rnbqkbnr/pppp1ppp/8/4p3/4P3/8/PPPP1PPP/RNBQKBN...,rnbqkbnr/pppp1ppp/8/4p3/4P3/8/PPPP1PPP/RNBQKBNR,w,KQkq,-,0,2
2,3,26,rnnq1rk1/1b3p1p/pp1p2p1/2p5/2P1PP2/N1b1B3/PP1Q...,g7c3,True,-115.0,-115.0,-115.0,-19.0,-19.0,...,-,0,14,rnnq1rk1/1b3pbp/pp1p2p1/2p5/2P1PP2/N1N1B3/PP1Q...,rnnq1rk1/1b3pbp/pp1p2p1/2p5/2P1PP2/N1N1B3/PP1Q...,b,-,-,1,13
3,7,33,r6r/1p2b1pp/p4nk1/n3q3/2Q2N2/N1P1B3/PP3PPP/R3K...,e6f4,True,355.0,355.0,402.0,-32.0,-55.0,...,-,4,17,r6r/1p2b1pp/p3Nnk1/n3q3/2Q5/N1P1B3/PP3PPP/R3K2...,r6r/1p2b1pp/p3Nnk1/n3q3/2Q5/N1P1B3/PP3PPP/R3K2R,w,KQ,-,3,17
4,7,114,8/8/K5pk/8/8/1P5p/8/8 w - - 0 58,h4h3,True,529.0,529.0,573.0,-21.0,-21.0,...,-,0,58,8/8/K5pk/8/7p/1P6/8/8 b - - 0 57,8/8/K5pk/8/7p/1P6/8/8,b,-,-,0,57


In [11]:
# Summary statistics for the numeric columns of the training split.
train_data.describe()

Unnamed: 0,game_number,move_number,stockfish_2,stockfish_5,stockfish_10,move_quality_2,move_quality_5,move_quality_10,prev_ELO,current_ELO,halfmove_clock,fullmove_number,prev_halfmove_clock,prev_fullmove_number
count,2478.0,2478.0,1263.0,1263.0,1263.0,1222.0,1222.0,1222.0,1239.0,2478.0,2478.0,2478.0,2478.0,2478.0
mean,9.65456,45.759483,-22.200317,-22.552652,-20.001584,-53.56874,-53.99018,-49.337152,1630.016949,1628.553672,1.232446,23.627119,1.828894,23.132365
std,6.055866,39.042789,385.928666,391.287489,410.929608,109.319314,109.939787,107.94926,240.84388,241.692168,2.836986,19.524789,3.288537,19.521202
min,0.0,1.0,-1507.0,-1507.0,-1616.0,-889.0,-889.0,-831.0,1124.0,1124.0,0.0,1.0,0.0,1.0
25%,4.0,16.0,-179.0,-182.0,-193.5,-75.75,-70.75,-63.0,1483.0,1463.0,0.0,9.0,0.0,8.0
50%,10.0,34.0,-5.0,-7.0,-8.0,-23.5,-21.0,-15.0,1605.0,1605.0,0.0,18.0,1.0,17.0
75%,15.0,64.0,123.5,115.5,131.0,-1.0,0.0,0.0,1752.0,1752.0,1.0,33.0,2.0,32.0
max,19.0,179.0,1279.0,1279.0,1507.0,383.0,383.0,611.0,2223.0,2223.0,26.0,90.0,26.0,90.0


In [12]:
# Summary statistics for the numeric columns of the validation split.
val_data.describe()

Unnamed: 0,game_number,move_number,stockfish_2,stockfish_5,stockfish_10,move_quality_2,move_quality_5,move_quality_10,prev_ELO,current_ELO,halfmove_clock,fullmove_number,prev_halfmove_clock,prev_fullmove_number
count,1080.0,1080.0,548.0,548.0,548.0,528.0,528.0,528.0,540.0,1080.0,1080.0,1080.0,1080.0,1080.0
mean,5.305556,33.009259,-20.987226,-23.498175,-23.202555,-52.589015,-57.753788,-57.204545,1616.746296,1616.475926,0.791667,17.251852,1.211111,16.757407
std,2.756939,23.221501,336.272646,341.360893,349.72439,107.505825,124.402877,123.414452,213.822782,215.089596,1.427301,11.617295,1.646145,11.609591
min,0.0,1.0,-899.0,-899.0,-899.0,-1168.0,-1168.0,-1164.0,1169.0,1169.0,0.0,1.0,0.0,1.0
25%,3.0,14.0,-176.0,-181.75,-191.0,-66.0,-71.0,-61.0,1477.0,1477.0,0.0,8.0,0.0,7.0
50%,5.0,28.5,-14.5,-18.0,-21.5,-29.0,-25.5,-21.0,1541.0,1541.0,0.0,15.0,1.0,14.5
75%,8.0,48.25,121.75,124.25,123.0,-2.0,-1.0,-2.0,1765.0,1777.5,1.0,25.0,2.0,24.25
max,9.0,94.0,900.0,900.0,900.0,187.0,301.0,172.0,1973.0,1973.0,9.0,48.0,8.0,47.0


In [13]:
# Summary statistics for the numeric columns of the test split.
test_data.describe()

Unnamed: 0,game_number,move_number,stockfish_2,stockfish_5,stockfish_10,move_quality_2,move_quality_5,move_quality_10,prev_ELO,current_ELO,halfmove_clock,fullmove_number,prev_halfmove_clock,prev_fullmove_number
count,934.0,934.0,471.0,471.0,471.0,451.0,451.0,451.0,467.0,934.0,934.0,934.0,934.0,934.0
mean,5.304069,34.010707,-21.772824,-24.874735,-23.755839,-52.215078,-58.594235,-56.875831,1605.672377,1603.554604,0.794433,17.751606,1.162741,17.259101
std,2.804384,27.016447,241.032578,248.275463,248.297149,115.819861,115.874595,114.302008,182.828157,184.531825,1.449593,13.514883,1.636685,13.506193
min,0.0,1.0,-899.0,-1011.0,-985.0,-867.0,-869.0,-927.0,1212.0,1212.0,0.0,1.0,0.0,1.0
25%,3.0,13.0,-128.0,-125.5,-126.0,-65.0,-77.0,-73.5,1481.0,1481.0,0.0,7.0,0.0,7.0
50%,6.0,28.0,-7.0,-7.0,-8.0,-24.0,-26.0,-22.0,1631.0,1631.0,0.0,15.0,0.0,14.0
75%,7.0,47.0,85.5,80.5,78.0,0.0,-0.5,-2.0,1796.0,1796.0,1.0,24.0,2.0,24.0
max,9.0,118.0,900.0,969.0,1101.0,175.0,147.0,116.0,1876.0,1876.0,10.0,60.0,10.0,59.0


In [14]:
# Count the rows per value of the `real` column in the training split.
# NOTE(review): presumably `real` separates moves actually played from
# generated ones — confirm against utils.
train_data['real'].value_counts()

real
True     1239
False    1239
Name: count, dtype: int64

In [15]:
# Count the rows per value of the `real` column in the validation split.
val_data['real'].value_counts()

real
True     540
False    540
Name: count, dtype: int64

In [16]:
# Count the rows per value of the `real` column in the test split.
test_data['real'].value_counts()

real
True     467
False    467
Name: count, dtype: int64

In [17]:
# Count the rows per value of the `legal` column in the training split.
train_data['legal'].value_counts()

legal
True     1263
False    1215
Name: count, dtype: int64

In [18]:
# Count the rows per value of the `legal` column in the validation split.
val_data['legal'].value_counts()

legal
True     548
False    532
Name: count, dtype: int64

In [19]:
# Count the rows per value of the `legal` column in the test split.
test_data['legal'].value_counts()

legal
True     471
False    463
Name: count, dtype: int64

In [20]:
# Report the number of rows in the training split.
print(train_data.shape[0])

2478
