Information about the chess.com public API (developer community): https://www.chess.com/club/chess-com-developer-community

# Dependencies

In [None]:
import pickle
import requests as rq
import time 
import pandas as pd
from IPython.display import display
from datetime import datetime
import chess.pgn
import io
import os

# Scrape all GM player profiles, stats and game archive URLs

In [None]:


# Record the wall-clock start so the elapsed time of the whole scrape can be
# reported once the player loop further down has finished.
time_start = time.time()
print(time_start)

# Base URL of the chess.com API
base_url = 'https://api.chess.com/pub/'

# Seconds to wait for a response. Without a timeout a stalled connection
# would block the scrape indefinitely.
REQUEST_TIMEOUT = 30

# Chess player titles to collect
titles = ['GM']

# One dataframe of usernames per title
titled_players = []

for t in titles:
    # Build the titled-players URL for this title
    url = base_url + 'titled/{}'.format(t)
    print(url)

    response = rq.get(url, timeout=REQUEST_TIMEOUT)
    # Fail with a clear HTTP error instead of an opaque KeyError on
    # 'players' when the API answers with an error status.
    response.raise_for_status()

    # Two columns: the username and the title it was listed under
    temp_df = pd.DataFrame()
    temp_df['Player'] = response.json()['players']
    temp_df['Title'] = t
    titled_players.append(temp_df)

    # Short pause between requests
    time.sleep(0.5)

# Concatenate the per-title dataframes into one and persist it
titled_players_df = pd.concat(titled_players)
titled_players_df.to_pickle('Titled_Players.pkl')

# Accumulators: one dataframe per player profile, per (player, rating type)
# stats record, and per player's list of monthly archive URLs.
player_profile_dfs = []
player_stats_df = []
player_games = []

# Seconds to wait for a response. Without a timeout a stalled connection
# would block the scrape indefinitely.
REQUEST_TIMEOUT = 30

# Stats sections that are skipped when building the ratings table
NON_RATING_STATS = {'tactics', 'lessons', 'puzzle_rush'}

for i in titled_players_df['Player']:

    # --- Profile -----------------------------------------------------------
    player_url = 'https://api.chess.com/pub/player/{}'.format(i)
    # Request the profile once and reuse the response for both the progress
    # print and the JSON payload.
    profile_response = rq.get(player_url, timeout=REQUEST_TIMEOUT)
    print(i, profile_response)

    player_json = profile_response.json()
    # Convert the epoch timestamps to plain dates
    player_json['last_online'] = datetime.fromtimestamp(player_json['last_online']).date()
    player_json['joined'] = datetime.fromtimestamp(player_json['joined']).date()
    # Wrapping the dict in a list makes it a single record, so a list-valued
    # field is stored as one cell instead of being read as a column of values.
    player_profile_dfs.append(pd.DataFrame([player_json], index=[i]))
    time.sleep(0.25)

    # --- Stats -------------------------------------------------------------
    player_stats_url = 'https://api.chess.com/pub/player/{}/stats'.format(i)
    player_stats_json = rq.get(player_stats_url, timeout=REQUEST_TIMEOUT).json()

    for stat, stat_json in player_stats_json.items():
        if stat in NON_RATING_STATS:
            continue
        try:
            last = stat_json['last']
            best = stat_json['best']
            record = stat_json['record']
            data = {'Type': stat,
                    'Current_Rating': last['rating'],
                    'Current_Rating_Date': datetime.fromtimestamp(last['date']).date(),
                    'RD': last['rd'],
                    'Best_Rating': best['rating'],
                    'Best_Rating_Date': datetime.fromtimestamp(best['date']).date(),
                    'Win': record['win'],
                    'Loss': record['loss'],
                    'Draw': record['draw']}
        except (KeyError, TypeError):
            # Entries that are not a mapping, or that lack one of the
            # last/best/record fields, are skipped on purpose.
            continue
        player_stats_df.append(pd.DataFrame(data, index=[i]))

    time.sleep(0.25)

    # --- Monthly archive URLs ----------------------------------------------
    player_game_archives_url = 'https://api.chess.com/pub/player/{}/games/archives'.format(i)
    player_game_archives_json = rq.get(player_game_archives_url,
                                       timeout=REQUEST_TIMEOUT).json()

    temp_df = pd.DataFrame()
    temp_df['Games'] = player_game_archives_json['archives']
    temp_df['Player'] = i
    player_games.append(temp_df)
    time.sleep(0.25)

# Persist the three collected tables
player_profile_df = pd.concat(player_profile_dfs)
player_profile_df.to_pickle('Player_Profile.pkl')

player_stat_df = pd.concat(player_stats_df)
player_stat_df.to_pickle('Player_Stats.pkl')

player_games_df = pd.concat(player_games)
player_games_df.to_pickle('Player_Games.pkl')

time_stop = time.time()
print('Time Elapsed:{} seconds'.format(round(time_stop - time_start, 2)))

# Scrape all GM Game PGNs and metadata

In [None]:
time_start = time.time()
print('Time started: {}'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time_start))))

# Output directory: one CSV per player. exist_ok makes re-runs harmless
# without hiding other failures (e.g. missing permissions).
path = 'Player_Games'
os.makedirs(path, exist_ok=True)

# Seconds to wait for a response. Without a timeout a stalled connection
# would block the scrape indefinitely.
REQUEST_TIMEOUT = 30
# Only games whose PGN carries exactly this many headers are kept.
EXPECTED_HEADER_COUNT = 21
# Prefix stripped from the ECOUrl header to leave just the opening name.
ECO_URL_PREFIX = 'https://www.chess.com/openings/'
# Upper bound on the number of players processed in one run.
MAX_PLAYERS = 1400
# Pause after a failed archive request.
# NOTE(review): presumably long enough to ride out API rate limiting - confirm.
BACKOFF_SECONDS = 150

df = pd.read_pickle('Player_Games.pkl')
player_profile_df = pd.read_pickle('Player_Profile.pkl')
games_df = []


def _game_row(g, player, player_name):
    """Build one flat record for a game, or return None if it is filtered out.

    g is one entry of an archive's 'games' list; player and player_name are
    copied into the record unchanged. Raises KeyError when the game lacks a
    field the record needs ('pgn', 'url', 'ECOUrl', 'White', 'WhiteElo',
    'BlackElo').
    """
    parsed = chess.pgn.read_game(io.StringIO(g['pgn']))
    if parsed is None or len(parsed.headers) != EXPECTED_HEADER_COUNT:
        return None

    data = {'player': player, 'player_name': player_name, 'url': g['url']}

    # Accuracies are optional; both fall back to '-' unless both are present.
    try:
        data['white_Accuracy'] = g['accuracies']['white']
        data['black_Accuracy'] = g['accuracies']['black']
    except (KeyError, TypeError):
        data['white_Accuracy'] = '-'
        data['black_Accuracy'] = '-'

    for key, value in parsed.headers.items():
        data[key] = value
    data['pgn'] = g['pgn']
    data['ECOUrl'] = data['ECOUrl'].replace(ECO_URL_PREFIX, '').replace('-', ' ')

    # Compare usernames case-insensitively: the username from the player list
    # is not guaranteed to have the same letter case as the PGN 'White'
    # header, and a case-sensitive mismatch would wrongly pick BlackElo.
    if data['White'].lower() == player.lower():
        data['player_rating'] = data['WhiteElo']
    else:
        data['player_rating'] = data['BlackElo']
    return data


for player in df['Player'].unique()[0:MAX_PLAYERS]:
    p_time_start = time.time()
    player_csv = os.path.join(path, '{}.csv'.format(player))

    # Resume support: a player whose CSV already exists is not fetched again.
    if os.path.isfile(player_csv):
        print('{} player data have already been collected.'.format(player))
        continue

    # Look the display name up once per player instead of once per game. A
    # missing profile entry must not discard every game of that player.
    try:
        player_name = player_profile_df.loc[player]['name']
    except KeyError:
        player_name = None

    n_requests = 0
    n_games = 0
    player_games = []

    for archive_url in df.loc[df['Player'] == player, 'Games']:
        try:
            games = rq.get(archive_url, timeout=REQUEST_TIMEOUT).json()['games']
        except (rq.RequestException, ValueError, KeyError) as err:
            # Network failure, non-JSON body, or a payload without 'games':
            # back off, then carry on with the next archive.
            print('Request failed for {}: {!r}'.format(archive_url, err))
            time.sleep(BACKOFF_SECONDS)
            continue
        n_requests += 1

        for g in games:
            n_games += 1
            try:
                row = _game_row(g, player, player_name)
            except (KeyError, AttributeError, TypeError, ValueError):
                # Best effort: a malformed game is skipped, not fatal.
                continue
            if row is not None:
                player_games.append(row)

        time.sleep(0.25)

    if player_games:
        # One dataframe from all records, indexed by game URL.
        temp_df = pd.DataFrame(player_games, index=[row['url'] for row in player_games])
        temp_df.to_csv(player_csv)
    else:
        print('No games found for player: {}'.format(player))

    p_time_stop = time.time()
    print('{} player data collection completed in {} seconds, total requests: {}, total_games:{}'.format(
        player, round(p_time_stop - p_time_start, 2), n_requests, n_games))

    time.sleep(0.5)

time_stop = time.time()

print('Time Finished: {}'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time_stop))))

# Merge data in a single dataframe

In [None]:
# Merge every per-player CSV in Player_Games into a single dataset file.
path = 'Player_Games'

# Columns kept in the merged dataset, in output order.
DATASET_COLUMNS = ['player', 'player_name', 'url', 'white_Accuracy', 'black_Accuracy',
                   'Event', 'Site', 'Date', 'White', 'Black', 'Result', 'BlackElo',
                   'ECO', 'ECOUrl', 'EndDate', 'EndTime', 'Termination', 'TimeControl',
                   'WhiteElo', 'pgn', 'player_rating']

games_df = []

# Sorted so the row order of the merged file does not depend on the
# arbitrary order in which the directory is listed.
for i in sorted(os.listdir(path)):
    # Skip anything that is not a per-player CSV (hidden files, folders, ...)
    if not i.endswith('.csv'):
        continue
    temp_df = pd.read_csv(os.path.join(path, i))
    games_df.append(temp_df[DATASET_COLUMNS])

if not games_df:
    raise ValueError('No player CSV files found in {!r}'.format(path))

# ignore_index gives the merged frame one unique running index instead of
# repeating each file's own 0..n index.
df = pd.concat(games_df, ignore_index=True)
df.to_csv('GM_games_dataset.csv')