# Data Reading Notebook

This notebook showcases how the data is read from the PGN file and transformed into a CSV file suitable for feeding into a model or for analysis in later sections.

## Imports

In [1]:
from utils.utils import *
import warnings

In [2]:
# Silence every warning category for the rest of the kernel session so the
# cell outputs below stay readable. Note that this also hides deprecation and
# dtype warnings raised by third-party libraries.
warnings.filterwarnings("ignore")

## Data Reading

In [3]:
# Locations of the processed (gzip-compressed CSV) splits and of the Stockfish binary.
# Forward slashes are used throughout: Windows accepts them as separators, whereas a
# backslash is an ordinary filename character on Linux/macOS and would make the files
# land in the wrong place.
train_data_path = "../Processed_Data/NLP_Train.csv.gz"
val_data_path = "../Processed_Data/NLP_Val.csv.gz"
test_data_path = "../Processed_Data/NLP_Test.csv.gz"
# NOTE(review): this is a Windows-only executable; on other platforms point this
# at the matching Stockfish build.
stockfish_path = "../stockfish/stockfish-windows-x86-64-avx2.exe"

In [4]:
# Build the training split from the lichess 2013-08 archive, capped at 20000 games.
# The path uses forward slashes only so it resolves on every OS, not just Windows.
# NOTE(review): save_data comes from the `utils.utils` star import; shuffle=True is
# passed with no seed visible here, so confirm in utils whether the output file is
# reproducible across runs.
train_pgn_path = "../Data/Train/lichess_db_standard_rated_2013-08.pgn.zst"
save_data(pgn_file_path=train_pgn_path,
          save_file_path=train_data_path, max_num_games=20000,
          stockfish_path=stockfish_path, shuffle=True)

Num processed games in a file = 20000


In [5]:
# Build the test split from the lichess 2013-02 archive, capped at 1000 games.
# The path uses forward slashes only so it resolves on every OS, not just Windows.
# NOTE(review): save_data comes from the `utils.utils` star import; shuffle=True is
# passed with no seed visible here, so confirm in utils whether the output file is
# reproducible across runs.
test_pgn_path = "../Data/Test/lichess_db_standard_rated_2013-02.pgn.zst"
save_data(pgn_file_path=test_pgn_path,
          save_file_path=test_data_path, max_num_games=1000,
          stockfish_path=stockfish_path, shuffle=True)

Num processed games in a file = 1000


In [6]:
# Build the validation split from the lichess 2013-01 archive, capped at 1000 games.
# The path uses forward slashes only so it resolves on every OS, not just Windows.
# NOTE(review): save_data comes from the `utils.utils` star import; shuffle=True is
# passed with no seed visible here, so confirm in utils whether the output file is
# reproducible across runs.
val_pgn_path = "../Data/Val/lichess_db_standard_rated_2013-01.pgn.zst"
save_data(pgn_file_path=val_pgn_path,
          save_file_path=val_data_path, max_num_games=1000,
          stockfish_path=stockfish_path, shuffle=True)

Num processed games in a file = 1000


In [7]:
# Load the three gzip-compressed CSV splits back into DataFrames, in the order
# train -> validation -> test.
train_data, val_data, test_data = (
    pd.read_csv(split_path, compression='gzip')
    for split_path in (train_data_path, val_data_path, test_data_path)
)

## Data Showcase

In [8]:
# Preview the first rows of the training split; the frame is wide, so pandas
# elides the middle columns in the rendered table.
train_data.head()

Unnamed: 0,game_number,move_number,board,move,legal,stockfish_2,stockfish_5,stockfish_10,move_quality_2,move_quality_5,...,en_passant,halfmove_clock,fullmove_number,prev_board,prev_piece_placement,prev_active_color,prev_castling_availability,prev_en_passant,prev_halfmove_clock,prev_fullmove_number
0,8610,74,3rQ2k/p4Q1p/1p6/8/3R3p/8/PPP3PP/5RK1 w - - 1 38,g7d8,False,,,,,,...,-,1,38,4Q2k/p4Qrp/1p6/8/3R3p/8/PPP3PP/5RK1 b - - 0 37,4Q2k/p4Qrp/1p6/8/3R3p/8/PPP3PP/5RK1,b,-,-,0,37
1,14299,76,8/6p1/5p1p/1p1p1P2/1P1K1kP1/P6P/5p2/8 w - - 0 39,f3f2,False,,,,,,...,-,0,39,8/6p1/5p1p/1p1p1P2/1P1K1kP1/P4P1P/8/8 b - - 0 38,8/6p1/5p1p/1p1p1P2/1P1K1kP1/P4P1P/8/8,b,-,-,0,38
2,11736,55,4rb2/5kp1/1p1p1pp1/2pP4/P1P3R1/1PB3PP/5PK1/6P1...,a6g1,False,,,,,,...,-,0,28,4rb2/5kp1/pp1p1pp1/2pP4/P1P3R1/1PB3PP/5PK1/8 w...,4rb2/5kp1/pp1p1pp1/2pP4/P1P3R1/1PB3PP/5PK1/8,w,-,-,0,28
3,12785,62,1r3k2/p4p2/2B2p2/P2Pp3/4PnR1/4KP1p/5P1P/8 w - ...,c8b8,True,57.0,57.0,79.0,15.0,15.0,...,-,2,32,2r2k2/p4p2/2B2p2/P2Pp3/4PnR1/4KP1p/5P1P/8 b - ...,2r2k2/p4p2/2B2p2/P2Pp3/4PnR1/4KP1p/5P1P/8,b,-,-,1,31
4,8175,66,6k1/6pp/p7/3bKp2/1p6/1P2R3/P7/6r1 w - - 0 34,e6f5,True,473.0,473.0,493.0,41.0,41.0,...,-,0,34,6k1/6pp/p3p3/3bKR2/1p6/1P2R3/P7/6r1 b - - 0 33,6k1/6pp/p3p3/3bKR2/1p6/1P2R3/P7/6r1,b,-,-,0,33


In [9]:
# Preview the first rows of the validation split; the frame is wide, so pandas
# elides the middle columns in the rendered table.
val_data.head()

Unnamed: 0,game_number,move_number,board,move,legal,stockfish_2,stockfish_5,stockfish_10,move_quality_2,move_quality_5,...,en_passant,halfmove_clock,fullmove_number,prev_board,prev_piece_placement,prev_active_color,prev_castling_availability,prev_en_passant,prev_halfmove_clock,prev_fullmove_number
0,601,63,8/pnp2pk1/1p1q2p1/rP4Q1/4R3/6PP/5P2/4R1K1 b - ...,g2g3,True,227.0,164.0,273.0,-144.0,-207.0,...,-,0,32,8/pnp2pk1/1p1q2p1/rP4Q1/4R3/7P/5PP1/4R1K1 w - ...,8/pnp2pk1/1p1q2p1/rP4Q1/4R3/7P/5PP1/4R1K1,w,-,-,4,32
1,859,136,3Q4/4n3/5kp1/7p/6r1/8/5K2/8 w - - 24 69,f5e7,True,14.0,14.0,15.0,14.0,14.0,...,-,24,69,3Q4/8/5kp1/5n1p/6r1/8/5K2/8 b - - 23 68,3Q4/8/5kp1/5n1p/6r1/8/5K2/8,b,-,-,23,68
2,161,57,r3r1k1/pp4pb/7p/7P/1PP1p1P1/2K5/P7/8 b - - 1 29,d2c3,True,-714.0,-714.0,-1009.0,-64.0,-48.0,...,-,1,29,r3r1k1/pp4pb/7p/7P/1PP1p1P1/8/P2K4/8 w - - 0 29,r3r1k1/pp4pb/7p/7P/1PP1p1P1/8/P2K4/8,w,-,-,0,29
3,70,64,2rkr3/5p2/n1bB4/p2p3p/4nR1P/3K4/P7/8 w - - 6 33,h8e8,True,739.0,748.0,767.0,-35.0,-26.0,...,-,6,33,2rk3r/5p2/n1bB4/p2p3p/4nR1P/3K4/P7/8 b - - 5 32,2rk3r/5p2/n1bB4/p2p3p/4nR1P/3K4/P7/8,b,-,-,5,32
4,968,35,2rq1rk1/1b1nnpb1/1p2p1p1/pP1p3p/P2P4/2NBPP2/1B...,g4f2,True,-43.0,-44.0,-39.0,-6.0,-5.0,...,-,1,18,2rq1rk1/1b1nnpb1/1p2p1p1/pP1p3p/P2P2N1/2NBPP2/...,2rq1rk1/1b1nnpb1/1p2p1p1/pP1p3p/P2P2N1/2NBPP2/...,w,-,-,0,18


In [10]:
# Preview the first rows of the test split; the frame is wide, so pandas
# elides the middle columns in the rendered table.
test_data.head()

Unnamed: 0,game_number,move_number,board,move,legal,stockfish_2,stockfish_5,stockfish_10,move_quality_2,move_quality_5,...,en_passant,halfmove_clock,fullmove_number,prev_board,prev_piece_placement,prev_active_color,prev_castling_availability,prev_en_passant,prev_halfmove_clock,prev_fullmove_number
0,895,45,r5k1/5pbp/1p4P1/1N1Pp3/1pP1P1b1/1Q1B4/P5PP/5RK...,a5g6,False,,,,,,...,-,0,23,r5k1/5pbp/1p4p1/pN1Pp3/1pP1P1b1/1Q1B4/P5PP/5RK...,r5k1/5pbp/1p4p1/pN1Pp3/1pP1P1b1/1Q1B4/P5PP/5RK1,w,-,-,2,23
1,959,50,r5k1/p2Q1pp1/n7/4p2p/3b4/2KP4/PP2q2P/RN5R w - ...,e3d4,True,598.0,596.0,727.0,-147.0,-149.0,...,-,0,26,r5k1/p2Q1pp1/n7/4p2p/3P4/2KPb3/PP2q2P/RN5R b -...,r5k1/p2Q1pp1/n7/4p2p/3P4/2KPb3/PP2q2P/RN5R,b,-,-,3,25
2,465,34,2k1rb1r/pp1nqppp/b1p5/3pP3/1P3P2/2P3PP/P1QN2B1...,d8e8,True,-273.0,-273.0,-267.0,-22.0,-31.0,...,-,1,18,2kr1b1r/pp1nqppp/b1p5/3pP3/1P3P2/2P3PP/P1QN2B1...,2kr1b1r/pp1nqppp/b1p5/3pP3/1P3P2/2P3PP/P1QN2B1...,b,-,-,0,17
3,768,32,r2q1rk1/1p2bpp1/p1n1pnbp/8/3P3B/P1NQ1P2/1PB1N1...,h5g6,True,120.0,120.0,129.0,84.0,84.0,...,-,6,17,r2q1rk1/1p2bpp1/p1n1pn1p/7b/3P3B/P1NQ1P2/1PB1N...,r2q1rk1/1p2bpp1/p1n1pn1p/7b/3P3B/P1NQ1P2/1PB1N...,b,-,-,5,16
4,641,9,rnb1kbnr/ppp2ppp/3p1q2/8/2NPp3/4P3/PPP2PPP/RNB...,e5c4,True,-21.0,-21.0,-41.0,19.0,19.0,...,-,1,5,rnb1kbnr/ppp2ppp/3p1q2/4N3/3Pp3/4P3/PPP2PPP/RN...,rnb1kbnr/ppp2ppp/3p1q2/4N3/3Pp3/4P3/PPP2PPP/RN...,w,KQkq,-,0,5


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the training split.
train_data.describe()

Unnamed: 0,game_number,move_number,stockfish_2,stockfish_5,stockfish_10,move_quality_2,move_quality_5,move_quality_10,prev_ELO,current_ELO,halfmove_clock,fullmove_number,prev_halfmove_clock,prev_fullmove_number
count,2650486.0,2650486.0,1353099.0,1353099.0,1353099.0,1312727.0,1312727.0,1312727.0,1325243.0,2650486.0,2650486.0,2650486.0,2650486.0,2650486.0
mean,10021.28,41.51673,-24.62001,-27.06882,-24.01864,-57.23654,-61.7716,-55.76219,1625.895,1625.306,1.118049,21.50641,1.666902,21.01032
std,5768.842,30.75528,344.099,349.398,357.6443,112.8228,124.4606,116.8503,220.2294,220.3231,2.836536,15.3822,3.290506,15.37714
min,0.0,1.0,-1925.0,-1925.0,-1753.0,-1798.0,-1995.0,-1991.0,0.0,0.0,0.0,1.0,0.0,1.0
25%,5003.0,17.0,-200.0,-214.0,-217.0,-80.0,-81.0,-67.0,1487.0,1486.0,0.0,9.0,0.0,9.0
50%,9956.0,36.0,-10.0,-13.0,-13.0,-30.0,-28.0,-21.0,1619.0,1618.0,0.0,19.0,1.0,18.0
75%,15027.0,59.0,133.0,132.0,141.0,-2.0,-1.0,0.0,1761.0,1761.0,1.0,30.0,2.0,30.0
max,19999.0,239.0,2216.0,1925.0,1747.0,1335.0,1331.0,852.0,2355.0,2355.0,101.0,120.0,100.0,120.0


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the validation split.
val_data.describe()

Unnamed: 0,game_number,move_number,stockfish_2,stockfish_5,stockfish_10,move_quality_2,move_quality_5,move_quality_10,prev_ELO,current_ELO,halfmove_clock,fullmove_number,prev_halfmove_clock,prev_fullmove_number
count,122338.0,122338.0,62464.0,62464.0,62464.0,60451.0,60451.0,60451.0,61169.0,122338.0,122338.0,122338.0,122338.0,122338.0
mean,499.390639,38.615017,-26.011543,-28.924501,-25.893827,-61.289722,-66.705993,-60.682007,1635.741094,1634.995733,1.003327,20.055322,1.488564,19.559695
std,290.979365,29.141178,368.861963,374.385423,382.058859,122.214401,134.069965,124.847395,206.008753,206.023952,2.11499,14.575389,2.417875,14.570077
min,0.0,1.0,-1550.0,-1766.0,-1390.0,-1798.0,-1798.0,-1798.0,0.0,0.0,0.0,1.0,0.0,1.0
25%,243.0,16.0,-220.0,-236.0,-237.0,-86.0,-88.0,-72.0,1504.0,1504.0,0.0,9.0,0.0,8.0
50%,507.0,33.0,-12.0,-15.0,-15.0,-31.0,-30.0,-23.0,1633.0,1632.0,0.0,17.0,1.0,17.0
75%,751.0,54.0,150.0,148.0,159.0,-2.0,-1.0,0.0,1807.0,1805.0,1.0,28.0,2.0,27.0
max,999.0,186.0,1576.0,1695.0,1393.0,902.0,991.0,621.0,2161.0,2161.0,35.0,94.0,34.0,93.0


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the test split.
test_data.describe()

Unnamed: 0,game_number,move_number,stockfish_2,stockfish_5,stockfish_10,move_quality_2,move_quality_5,move_quality_10,prev_ELO,current_ELO,halfmove_clock,fullmove_number,prev_halfmove_clock,prev_fullmove_number
count,130112.0,130112.0,66440.0,66440.0,66440.0,64418.0,64418.0,64418.0,65056.0,130112.0,130112.0,130112.0,130112.0,130112.0
mean,500.405604,40.707375,-25.778311,-28.094326,-25.872652,-59.164892,-63.293862,-58.938325,1604.964215,1604.38473,1.120442,21.10162,1.666072,20.605755
std,290.514289,30.535309,351.202442,356.595772,365.086907,116.908464,127.979333,122.810274,205.206152,205.051721,3.056542,15.272213,3.6003,15.267189
min,0.0,1.0,-1415.0,-1428.0,-1390.0,-1798.0,-1798.0,-1798.0,0.0,0.0,0.0,1.0,0.0,1.0
25%,251.0,17.0,-207.0,-221.0,-225.0,-83.0,-83.0,-70.0,1478.0,1478.0,0.0,9.0,0.0,9.0
50%,499.0,35.0,-11.0,-14.0,-15.0,-30.0,-29.0,-22.0,1610.0,1610.0,0.0,18.0,1.0,18.0
75%,752.0,57.0,137.25,137.0,146.0,-2.0,-1.0,0.0,1739.0,1739.0,1.0,29.0,2.0,29.0
max,999.0,202.0,1588.0,1433.0,1533.0,1104.0,1221.0,1422.0,2137.0,2137.0,86.0,102.0,85.0,101.0


In [14]:
# Row count per value of the `real` flag in the training split (class-balance check).
# NOTE(review): the meaning of `real` is defined by save_data in utils — confirm there.
train_data['real'].value_counts()

real
False    1325243
True     1325243
Name: count, dtype: int64

In [15]:
# Row count per value of the `real` flag in the validation split (class-balance check).
# NOTE(review): the meaning of `real` is defined by save_data in utils — confirm there.
val_data['real'].value_counts()

real
True     61169
False    61169
Name: count, dtype: int64

In [16]:
# Row count per value of the `real` flag in the test split (class-balance check).
# NOTE(review): the meaning of `real` is defined by save_data in utils — confirm there.
test_data['real'].value_counts()

real
False    65056
True     65056
Name: count, dtype: int64

In [17]:
# Row count per value of the `legal` flag in the training split.
train_data['legal'].value_counts()

legal
True     1353099
False    1297387
Name: count, dtype: int64

In [18]:
# Row count per value of the `legal` flag in the validation split.
val_data['legal'].value_counts()

legal
True     62464
False    59874
Name: count, dtype: int64

In [19]:
# Row count per value of the `legal` flag in the test split.
test_data['legal'].value_counts()

legal
True     66440
False    63672
Name: count, dtype: int64

In [20]:
# Total number of rows in the training split.
print(len(train_data))

2650486
