# Imports

In [1]:
# standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# magic lines
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
#for api call
import requests

#regex for extracting start time from pgn string
import re 

#for file i/o
import json
import pickle
import os

# Get the data

## Get one month

In [3]:
#https://www.chess.com/clubs/forum/view/error-403-in-member-profile?page=2
#r.requests.get(url, headers = {'User-Agent': 'username: myaccount, email: my@email'}
params = {
    "User-Agent" : 'username: JammyNinja,  email: jammyninja95@gmail.com'
    } 

In [4]:
#https://api.chess.com/pub/player/jammyninja/games/2024/05
url = 'https://api.chess.com/pub/player/jammyninja/games/2024/05'
response = requests.get(url,headers=params)

In [5]:
print(response.json())

{'games': [{'url': 'https://www.chess.com/game/live/108328937747', 'pgn': '[Event "Live Chess"]\n[Site "Chess.com"]\n[Date "2024.05.01"]\n[Round "-"]\n[White "RCFurtado"]\n[Black "JammyNinja"]\n[Result "1-0"]\n[CurrentPosition "6r1/4Qk2/2p3N1/5P2/p2P1B2/P1P5/8/7K b - -"]\n[Timezone "UTC"]\n[ECO "B01"]\n[ECOUrl "https://www.chess.com/openings/Scandinavian-Defense-Mieses-Kotrc-Main-Line-4.Bc4-Nf6-5.Nf3"]\n[UTCDate "2024.05.01"]\n[UTCTime "19:02:03"]\n[WhiteElo "1259"]\n[BlackElo "1216"]\n[TimeControl "180+2"]\n[Termination "RCFurtado won by checkmate"]\n[StartTime "19:02:03"]\n[EndDate "2024.05.01"]\n[EndTime "19:10:41"]\n[Link "https://www.chess.com/game/live/108328937747"]\n\n1. e4 {[%clk 0:03:01.3]} 1... d5 {[%clk 0:03:00.6]} 2. exd5 {[%clk 0:03:02]} 2... Qxd5 {[%clk 0:03:01.1]} 3. Nc3 {[%clk 0:03:03.9]} 3... Qa5 {[%clk 0:03:00.8]} 4. Bc4 {[%clk 0:03:03.7]} 4... Nf6 {[%clk 0:03:00.3]} 5. Nf3 {[%clk 0:03:03.6]} 5... Bf5 {[%clk 0:02:58]} 6. d4 {[%clk 0:02:48.3]} 6... e6 {[%clk 0:02:51.5

## Get one game

In [6]:
len(response.json()["games"])

test_game = response.json()['games'][3]
# test_game = all_games_list[2251]
test_game

{'url': 'https://www.chess.com/game/live/108330639047',
 'pgn': '[Event "Live Chess"]\n[Site "Chess.com"]\n[Date "2024.05.01"]\n[Round "-"]\n[White "JammyNinja"]\n[Black "Cedcedcedcedced"]\n[Result "0-1"]\n[CurrentPosition "r1b3k1/7r/3p2p1/6K1/7q/8/6P1/1q6 w - -"]\n[Timezone "UTC"]\n[ECO "B23"]\n[ECOUrl "https://www.chess.com/openings/Closed-Sicilian-Defense-Traditional-Line-3.Bc4"]\n[UTCDate "2024.05.01"]\n[UTCTime "19:27:12"]\n[WhiteElo "1293"]\n[BlackElo "1286"]\n[TimeControl "60"]\n[Termination "Cedcedcedcedced won by checkmate"]\n[StartTime "19:27:12"]\n[EndDate "2024.05.01"]\n[EndTime "19:28:57"]\n[Link "https://www.chess.com/game/live/108330639047"]\n\n1. e4 {[%clk 0:01:00]} 1... c5 {[%clk 0:01:00]} 2. Nc3 {[%clk 0:00:59.6]} 2... Nc6 {[%clk 0:00:59.4]} 3. Bc4 {[%clk 0:00:58.4]} 3... d6 {[%clk 0:00:58.7]} 4. d3 {[%clk 0:00:57.9]} 4... g6 {[%clk 0:00:57.9]} 5. a3 {[%clk 0:00:57.4]} 5... Bg7 {[%clk 0:00:57.1]} 6. Be3 {[%clk 0:00:55.8]} 6... Nf6 {[%clk 0:00:56.4]} 7. Nd5 {[%clk 0:00

## Extract relevant data from one game

In [7]:
def game_to_dict(game):
    """
        take raw game dictionary as input 
        output dictionary ready to be dataframed
    """
    #get the moves
    game_pgn = game["pgn"]
    moves = "".join(game_pgn.split("\n\n")[1].split("}")[:-1])
    result = game_pgn.split("}")[-1].strip()
    
    #opening (eco = Encyclopaedia of Chess Openings) 
    eco_pattern = r'\[ECO\s+"[^"]*"\]'
    opening_exists = True if len(re.findall(eco_pattern, game_pgn)) >= 1 else False
    if opening_exists:
        eco_url_pattern = r'\[ECOUrl\s+"[^"]*"\]' #[ECOUrl xyz]
        opening_code = re.search(r'"[^"]*"',re.search(eco_pattern, game_pgn)[0])[0].strip('"')
        opening_name = re.search(eco_url_pattern, game_pgn)[0].split('/')[-1][:-2]
    else:
        opening_code = np.nan
        opening_name = np.nan
    del opening_exists #refactor
    
    #get meta/admin
    url = game["url"]
    final_position_fen = game["fen"]
    rated = game["rated"]
    time_control = game["time_control"]
    time_class = game["time_class"]
    rules = game["rules"] #to exclude any chess960 or other that may have been played
    
    #extract start/end datetime
    end_time = pd.to_datetime(game["end_time"], unit="s")
    date = end_time.date()
    time_pattern = r'\[StartTime\s+"(\d{2}:\d{2}:\d{2})"\]'
    start_time = re.search(time_pattern, game_pgn)[1]
    #wrong date for daily games #refactor
    start_time = pd.to_datetime(str(date) + " " + start_time)
    
    #extract players info
    white_rating = game["white"]["rating"]
    white_result = game["white"]["result"]
    white_username = game["white"]["username"]
    
    black_rating = game["black"]["rating"]
    black_result = game["black"]["result"]
    black_username = game["black"]["username"]
    
    #wrap it up for output
    game_dict = {}
    vars_to_loop = [var for var in locals() 
                    if not (var.startswith("game") or var.endswith("pattern"))]
    
    for variable in vars_to_loop:
        game_dict[variable] = locals()[variable]
        
    return game_dict

# test_game_dict = game_to_dict(all_games_list[2251])
test_game_dict = game_to_dict(test_game)
test_game_dict

{'moves': '1. e4 {[%clk 0:01:00] 1... c5 {[%clk 0:01:00] 2. Nc3 {[%clk 0:00:59.6] 2... Nc6 {[%clk 0:00:59.4] 3. Bc4 {[%clk 0:00:58.4] 3... d6 {[%clk 0:00:58.7] 4. d3 {[%clk 0:00:57.9] 4... g6 {[%clk 0:00:57.9] 5. a3 {[%clk 0:00:57.4] 5... Bg7 {[%clk 0:00:57.1] 6. Be3 {[%clk 0:00:55.8] 6... Nf6 {[%clk 0:00:56.4] 7. Nd5 {[%clk 0:00:54.3] 7... O-O {[%clk 0:00:55.2] 8. Nxf6+ {[%clk 0:00:53.1] 8... Bxf6 {[%clk 0:00:54.7] 9. c3 {[%clk 0:00:53] 9... Qb6 {[%clk 0:00:52.8] 10. b4 {[%clk 0:00:51.5] 10... Bxc3+ {[%clk 0:00:50.7] 11. Bd2 {[%clk 0:00:48.7] 11... Bxa1 {[%clk 0:00:50.5] 12. Qxa1 {[%clk 0:00:48.5] 12... cxb4 {[%clk 0:00:49.2] 13. axb4 {[%clk 0:00:46.1] 13... Nxb4 {[%clk 0:00:48.5] 14. Bh6 {[%clk 0:00:40] 14... e5 {[%clk 0:00:44.4] 15. Nf3 {[%clk 0:00:37.3] 15... Nc2+ {[%clk 0:00:42.5] 16. Ke2 {[%clk 0:00:33.7] 16... Nxa1 {[%clk 0:00:42.4] 17. Rxa1 {[%clk 0:00:33.2] 17... Qb2+ {[%clk 0:00:41.4] 18. Ke3 {[%clk 0:00:32] 18... Qxa1 {[%clk 0:00:41.3] 19. Ng5 {[%clk 0:00:31.7] 19... Qd4+ {[

## Get all games into df

### preparatory functions for api call

In [8]:
def make_api_dates(start_date="2023-01-01", end_date="2024-06-01"):
    """ returns list of tuples where the sub dicts contain year and month
        eg: [('06', 2023), ('07', '2023')]
    """
    out_dates = []
    month_map = {1: '01', 2: '02', 3: '03', 4: '04', 5: '05', 6: '06',
                 7: '07', 8: '08', 9: '09', 10: '10', 11: '11', 12: '12' }
   
    for date in pd.date_range(start=start_date, end=end_date,freq='M'):
        month = month_map[date.month]
        year  = str(date.year)
        out_dates.append((month, year))
        
    return out_dates
make_api_dates()

[('01', '2023'),
 ('02', '2023'),
 ('03', '2023'),
 ('04', '2023'),
 ('05', '2023'),
 ('06', '2023'),
 ('07', '2023'),
 ('08', '2023'),
 ('09', '2023'),
 ('10', '2023'),
 ('11', '2023'),
 ('12', '2023'),
 ('01', '2024'),
 ('02', '2024'),
 ('03', '2024'),
 ('04', '2024'),
 ('05', '2024')]

In [9]:
def make_api_request(month, year):
    """ gets all user games from that month
        calls chess.com API endpoint below
        eg: https://api.chess.com/pub/player/jammyninja/games/2024/05
    """
    #weird params, thanks to 
    #https://www.chess.com/clubs/forum/view/error-403-in-member-profile?page=2
    params = {"User-Agent" : 'username: JammyNinja,  email: jammyninja95@gmail.com' } 
    
    url = f'https://api.chess.com/pub/player/jammyninja/games/{year}/{month}'
    response_json = requests.get(url,headers=params).json()
    
    #json object contains only one key, which is games
    return response_json["games"]
    

## 🚨 Execute api calls 🚨

In [10]:
def get_all_games_list(username="JammyNinja", start_date="2023-01-01", end_date="2024-06-01"):
    """
        Returns list of all games between start month and end month
    """
    months_to_loop = make_api_dates(start_date, end_date)
    all_games = []
    
    for month, year in months_to_loop:
        print("getting", month, year)
        games = make_api_request(month,year)
        all_games += games
    
    print(f"Downloaded a total of {len(all_games)} games.")
    
    return all_games

all_games_list = get_all_games_list()

getting 01 2023
getting 02 2023
getting 03 2023
getting 04 2023
getting 05 2023
getting 06 2023
getting 07 2023
getting 08 2023
getting 09 2023
getting 10 2023
getting 11 2023
getting 12 2023
getting 01 2024
getting 02 2024
getting 03 2024
getting 04 2024
getting 05 2024
Downloaded a total of 3208 games.


### convert list to df

In [11]:
def all_games_list_to_df(all_games_list):
    all_games_df = pd.DataFrame()
    clean_games = []
    for game in all_games_list:
        clean_game = game_to_dict(game)
        clean_games.append(clean_game)
    out_df = pd.DataFrame.from_dict(clean_games)
    
    col_order = ['date','start_time','end_time', 
                 'time_class', 'time_control', 'rated', 'rules', 'url', 
                 'moves', 'final_position_fen',
                 'opening_code', 'opening_name',
                 'white_username', 'black_username',
                 'result',
                 'white_rating', 'black_rating',
                 'white_result', 'black_result']
    
    return out_df[col_order]
    
all_games_df= all_games_list_to_df(all_games_list)
all_games_df

Unnamed: 0,date,start_time,end_time,time_class,time_control,rated,rules,url,moves,final_position_fen,opening_code,opening_name,white_username,black_username,result,white_rating,black_rating,white_result,black_result
0,2023-01-07,2023-01-07 08:17:06,2023-01-07 14:35:53,daily,1/604800,True,chess,https://www.chess.com/game/daily/437377841,1. e4 {[%clk 158:31:38] 1... c6 {[%clk 163:21:...,2r1k2r/pp3ppp/q7/4pb2/1b1P4/1P2B3/1P3PPP/K2R2N...,B10,Caro-Kann-Defense-Hillbilly-Attack-2...d5,sylvainau,JammyNinja,0-1,1298,1461,checkmated,win
1,2023-01-14,2023-01-14 11:34:48,2023-01-14 15:09:28,daily,1/604800,True,chess,https://www.chess.com/game/daily/448245095,1. e4 {[%clk 167:35:57] 1... c5 {[%clk 165:23:...,r2qr3/1b2bpkp/p2p1n2/1p4B1/3Q4/2P2N2/PP3PPP/3R...,B22,Alapin-Sicilian-Defense-2...Nc6-3.d4,JammyNinja,Timkee,0-1,1429,1234,timeout,win
2,2023-01-30,2023-01-30 17:32:36,2023-01-30 07:50:02,daily,1/604800,True,chess,https://www.chess.com/game/daily/436095119,1. e4 {[%clk 167:38:55] 1... c6 {[%clk 163:21:...,rr4k1/3n2pp/2p2p2/p2pp3/P7/1P5P/1RPPNPP1/5RK1 ...,B10,Caro-Kann-Defense-2.Nf3-d5-3.exd5-cxd5,ELNAHAS4,JammyNinja,0-1,1427,1449,timeout,win
3,2023-01-30,2023-01-30 21:29:25,2023-01-30 21:39:52,blitz,300,True,chess,https://www.chess.com/game/live/68853253455,1. Nf3 {[%clk 0:04:54.9] 1... d5 {[%clk 0:04:5...,8/8/4K3/8/8/8/4Q3/6k1 b - -,A09,Reti-Opening-Reti-Gambit-Accepted,ivangard57,JammyNinja,1-0,1022,1030,win,timeout
4,2023-02-07,2023-02-07 20:18:19,2023-02-07 19:25:50,daily,1/604800,True,chess,https://www.chess.com/game/daily/472634081,1. c4 {[%clk 164:52:18] 1... e5 {[%clk 125:36:...,rnbqk1nr/pppp1ppp/8/2b1p3/2P5/2N2P2/PP1PP1PP/R...,A21,English-Opening-Reversed-Sicilian-Variation,ajbouchie,JammyNinja,1/2-1/2,1584,1449,agreed,agreed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3203,2024-05-31,2024-05-31 12:33:22,2024-05-31 12:35:40,bullet,60,True,chess,https://www.chess.com/game/live/110898007115,1. e4 {[%clk 0:01:00] 1... d5 {[%clk 0:00:59.9...,8/8/8/RQ6/1k6/4N3/P5PK/3N4 b - -,B01,Scandinavian-Defense-Mieses-Kotrc-Main-Line-4....,Ingbing,JammyNinja,1-0,1356,1350,win,checkmated
3204,2024-05-31,2024-05-31 12:35:50,2024-05-31 12:37:37,bullet,60,True,chess,https://www.chess.com/game/live/110898034915,1. e4 {[%clk 0:01:00] 1... e6 {[%clk 0:01:00] ...,r2q1rk1/p3bppp/1p6/3RP3/1P3NQn/P7/1B3PPP/2R3K1...,C00,French-Defense-Queens-Knight-Variation-2...d5,JammyNinja,nhathuy82vn,1-0,1358,1360,win,timeout
3205,2024-05-31,2024-05-31 12:37:42,2024-05-31 12:39:28,bullet,60,True,chess,https://www.chess.com/game/live/110898057561,1. e4 {[%clk 0:01:00] 1... e5 {[%clk 0:01:00] ...,6rk/1p6/p2p4/2p5/3pP2q/P2P4/1PP5/R4RK1 w - -,C50,Giuoco-Piano-Game-4.Nc3,JammyNinja,21To25,0-1,1350,1347,checkmated,win
3206,2024-05-31,2024-05-31 13:49:35,2024-05-31 13:51:50,bullet,60,True,chess,https://www.chess.com/game/live/110902287015,1. e4 {[%clk 0:01:00] 1... d5 {[%clk 0:01:00] ...,5Q2/5R1k/7p/3pp2P/3P2p1/1P1N2P1/8/7K b - -,B01,Scandinavian-Defense-2.e5,Competent_Checkmate,JammyNinja,1-0,1356,1342,win,checkmated


### save df as csv

In [12]:
#dynamic query to find earliest/last dates?

filename = "all_games_df_2023-01_to_2024-05.csv"
all_games_df.to_csv(os.path.join("..","data",filename), index=False)

In [17]:
def save_file(df, start_date=None, end_date=None, split_char="-"):
    if not start_date or not end_date:
        filename = "all_games_raw_NO_DATE.csv"
    else:
        start = "-".join(start_date.split(split_char)[:-1])
        end = "-".join(end_date.split(split_char)[:-1])

        filename = f"all_games_raw_{start}_to_{end}.csv"

    df.to_csv(os.path.join("..","data",filename), index=False)
    
start_date="1995-01-01"
end_date="1996-06-01"
save_file(all_games_df)

In [14]:
all_games_df.columns

Index(['date', 'start_time', 'end_time', 'time_class', 'time_control', 'rated',
       'rules', 'url', 'moves', 'final_position_fen', 'opening_code',
       'opening_name', 'white_username', 'black_username', 'result',
       'white_rating', 'black_rating', 'white_result', 'black_result'],
      dtype='object')

### Todo
    
    
ctrl-f 'refactor' for refactoring todo list