# Préparer les données 

## 1. Télécharger les données

In [1]:
from downloader import ChessDataDownloader

# Monthly Lichess dump of rated standard games (.pgn.zst archive).
url = 'https://database.lichess.org/standard/lichess_db_standard_rated_2025-08.pgn.zst'
# Derive the archive name from the URL instead of repeating the literal,
# so the two cannot drift apart when the month is changed.
archive_name = url.rsplit('/', 1)[-1]

chessdataloader = ChessDataDownloader(url=url)
filepath = chessdataloader.download()
# NOTE(review): the recorded output shows the whole 30.2G archive being
# downloaded even though the decompressed file already existed; consider
# skipping the download in that case.
output_path = chessdataloader.decompress(archive_name)

Téléchargement de lichess_db_standard_rated_2025-08.pgn.zst...


100%|██████████| 30.2G/30.2G [06:13<00:00, 80.9MiB/s]

Le fichier décompressé lichess_db_standard_rated_2025-08.pgn existe déjà.





## 2. Filtrer les parties : nombre de parties, Elo minimum, nombre de coups min/max, temps de contrôle minimum

In [1]:
# `json` is imported explicitly: the dump at the end of this cell otherwise
# only works if one of the star imports happens to leak the name.
import json

from tokenizer import *
from parser import *

# Filtering parameters (also stored in the output file's metadata).
# NOTE(review): per the parse statistics, max_games appears to cap the number
# of accepted games, not the number of games read -- confirm in PGNParser.
max_games = 10
min_elo = 2000
min_moves = 10
max_moves = 300
filepath = "lichess_db_standard_rated_2025-08.pgn"
# Tag used to name the output file.
ref = "lichess-2025-08"

config = TokenizerConfig(
    pad_token='<PAD>',
    special_tokens=['<START>', '<END>'],
    save_path='./models/chess_tokenizer.json',
)

# Select and filter the games.
tokenizer = ChessTokenizer(config)
png_parser = PGNParser(
    tokenizer=tokenizer,
    min_elo=min_elo,
    min_moves=min_moves,
    max_moves=max_moves,
)
games, valid_games, total_games = png_parser.parse_file(
    filepath=filepath,
    max_games=max_games,
)

# Keep the filtering parameters next to the games so the file is self-describing.
metadata = {
    'max_games': max_games,
    'min_elo': min_elo,
    'min_moves': min_moves,
    'max_moves': max_moves,
    'filepath': filepath,
}
data = {'metadata': metadata, 'PGNs': games}

# Write the filtered games to disk.
with open(f"pgnFiltered_{ref}.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)


Vocabulary size: 46
Number of valid characters: 43
Critères de filtrage:
- Elo minimum: 2000
- Nombre de coups: 10-300
- Temps de contrôle minimum: 3.0 minutes


Parsing PGN: 3160it [00:00, 730569.98it/s]


Statistiques du parsing:
Parties totales: 158
Filtrées par Elo: 120
Filtrées par nombre de coups: 0
Filtrées par temps de contrôle: 28
Format invalide: 0
Parties valides: 10





In [2]:
# Display the tokenizer's vocabulary (token -> integer id mapping).
tokenizer.vocab

{'<PAD>': 0,
 '7': 1,
 'Q': 2,
 '2': 3,
 '3': 4,
 '6': 5,
 'B': 6,
 '4': 7,
 '8': 8,
 'c': 9,
 'K': 10,
 'e': 11,
 'b': 12,
 'a': 13,
 'P': 14,
 '5': 15,
 'g': 16,
 '1': 17,
 'f': 18,
 'd': 19,
 'N': 20,
 'h': 21,
 'R': 22,
 '#': 23,
 '.': 24,
 '=': 25,
 '9': 26,
 'x': 27,
 '-': 28,
 '0': 29,
 '+': 30,
 'O': 31,
 '\n': 32,
 '[': 33,
 '/': 34,
 "'": 35,
 '(': 36,
 ')': 37,
 ' ': 38,
 '\t': 39,
 ',': 40,
 ']': 41,
 '!': 42,
 '"': 43,
 '<START>': 44,
 '<END>': 45}

## 3. Convertir les parties en triplets (FEN, coup suivant, coups légaux)

In [3]:
import json
from make_fen import *

ref = "lichess-2025-08"

# Read the filtered games. The path is built from `ref`, like the output path
# below, so changing `ref` cannot silently read one dataset and write another.
with open(f"pgnFiltered_{ref}.json", "r", encoding="utf-8") as f:
    filtered = json.load(f)

metadata = filtered['metadata']
games = filtered['PGNs']

# Write one JSON line per position: the FEN, the move played from it and the
# legal moves, as returned by pgn2fen for each game.
with open(f"annot_data_{ref}.jsonl", "w", encoding="utf-8") as f:
    for game_idx, pgn in enumerate(games):
        fens, moves, legal_moves = pgn2fen(pgn)
        # Index into the parallel lists (rather than zip) so a length mismatch
        # still raises instead of silently dropping positions.
        for ply, fen in enumerate(fens):
            line = {
                'data_ref': ref,
                'id': f'{game_idx}_{ply}',
                'FEN': fen,
                'next_move': moves[ply],
                'legal_moves': legal_moves[ply],
            }
            f.write(json.dumps(line, ensure_ascii=False) + "\n")


## 4. Charger les données annotées enregistrées

In [5]:
import json

# Read the annotated positions back: one JSON object per line of the file.
with open(f"annot_data_{ref}.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(ligne) for ligne in f]

# Show one sample record.
data[5]

{'data_ref': 'lichess-2025-08',
 'id': '0_5',
 'FEN': 'rnbqkb1r/ppp1pppp/5n2/3p4/8/5NP1/PPPPPPBP/RNBQK2R b KQkq - 3 3',
 'next_move': 'e7e6',
 'legal_moves': ['h8g8',
  'e8d7',
  'd8d7',
  'd8d6',
  'c8d7',
  'c8e6',
  'c8f5',
  'c8g4',
  'c8h3',
  'b8d7',
  'b8c6',
  'b8a6',
  'f6g8',
  'f6d7',
  'f6h5',
  'f6g4',
  'f6e4',
  'h7h6',
  'g7g6',
  'e7e6',
  'c7c6',
  'b7b6',
  'a7a6',
  'd5d4',
  'h7h5',
  'g7g5',
  'e7e5',
  'c7c5',
  'b7b5',
  'a7a5']}