In [2]:
import pandas as pd
from IPython.display import display
from glob import glob
import json
import os
from tqdm import tqdm
from urllib.parse import quote
from time import sleep
import requests
from itertools import cycle
from lxml.html import fromstring

## Check how much player data we have

In [2]:
# Read promatches.csv into `matches` and preview its first three rows.
promatches_csv = "data/opendota/proplayers/promatches.csv"
matches = pd.read_csv(promatches_csv)
matches.head(n=3)

Unnamed: 0,match_id,duration,start_time,radiant_team_id,radiant_name,dire_team_id,dire_name,leagueid,league_name,series_id,series_type,radiant_score,dire_score,radiant_win
0,5307673835,1321,2020-03-22 14:43:25,7748848.0,AsghaR,7787796.0,LEGENDARY,11439,PESC Monthly 2020 Season 1,419196,0,26,6,True
1,5307672307,1568,2020-03-22 14:42:46,5211276.0,Thunder,6593779.0,Gentlemen,11439,PESC Monthly 2020 Season 1,419194,0,42,8,True
2,5307670528,3433,2020-03-22 14:42:00,7764709.0,BeWare,,,11439,PESC Monthly 2020 Season 1,0,0,36,37,False


In [6]:
# Walk every downloaded pro-match file, collect the distinct account ids that
# appear in them, and measure which share of the player slots already has a
# cached file under pro_players/.
files = glob("data/opendota/proplayers/pro_matches/*")

players_set = set()

cached_slots = 0  # player slots whose account already has a pro_players/ file
total_slots = 0   # every player slot seen across all match files

for file in tqdm(files):
    with open(file, "r") as f:
        match = json.load(f)

    for player in match['players']:
        total_slots += 1
        account_id = player['account_id']
        if account_id is None:
            # A null id cannot name a cache file or a player to fetch later;
            # the slot still counts, but the id stays out of the set.
            continue

        players_set.add(account_id)
        if os.path.exists(f"data/opendota/proplayers/pro_players/{account_id}.json"):
            cached_slots += 1

print(len(players_set))
# Divide by the slots actually seen rather than assuming ten per file, and
# report 0.0 instead of raising ZeroDivisionError when the glob matched nothing.
print(cached_slots / total_slots if total_slots else 0.0)

100%|██████████| 10000/10000 [00:31<00:00, 316.13it/s]

7420
0.37897





In [9]:
# Write the collected account ids to players_list.csv, one id per row and
# without the Series index.
account_ids = [*players_set]
players_set_pd = pd.Series(account_ids)
players_set_pd.to_csv(
    "data/opendota/proplayers/players_list.csv",
    index=False,
)

## Download every player's match history

In [15]:
# Fetch the match history of every account listed in players_list.csv and store
# each response body as players_matches/<account_id>.json. Accounts that already
# have a file are skipped, so the cell can be re-run to resume a download.
REQUEST_TIMEOUT = 30.0  # seconds before a stalled request is abandoned and retried
RETRY_DELAY = 1.0       # seconds to wait between two attempts

players_set_pd = pd.read_csv("data/opendota/proplayers/players_list.csv")

# The ids are in the first column; select it by position so the loop does not
# depend on the header the CSV was written with.
for account_id in tqdm(players_set_pd.iloc[:, 0]):
    url = f"https://api.opendota.com/api/players/{account_id}/matches"
    filename = f"data/opendota/proplayers/players_matches/{account_id}.json"

    if os.path.exists(filename):
        continue

    while True:
        try:
            result = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            # Only network-level failures (timeouts included) are retried. A bare
            # `except:` would also swallow KeyboardInterrupt, which makes this
            # loop impossible to stop from the notebook.
            sleep(RETRY_DELAY)
            continue

        if result.status_code == 200:
            # Written outside the `try` so that a disk error surfaces instead
            # of being retried forever.
            with open(filename, "w") as f:
                f.write(result.text)
            break

        # Any other status code is retried after a short pause.
        sleep(RETRY_DELAY)

100%|██████████| 7420/7420 [2:25:45<00:00,  1.18s/it]   


## Download every player's advanced match history

In [3]:
# Load the query template text from disk and print it for inspection.
with open("utils/player_query.txt", mode="r") as template_file:
    query_template = template_file.read()

print(query_template)

https://api.opendota.com/api/players/[[[account_id]]]/matches?significant=0&project=duration&project=game_mode&project=lobby_type&project=start_time&project=hero_id&project=start_time&project=version&project=kills&project=deaths&project=assists&project=skill&project=leaver_status&project=party_size&project=gold_per_min&&project=gold_per_min&project=xp_per_min&project=denies&project=hero_damage&project=tower_damage&project=hero_healing&project=stuns&project=tower_kills&project=neutral_kills&project=courier_kills&project=actions_per_min&project=kda&project=level


In [4]:
# Fetch the projected ("full") match history of every account listed in
# players_list.csv and store each response body as
# players_matches_full/<account_id>.json. The request URL is `query_template`
# with its [[[account_id]]] placeholder substituted. Accounts that already have
# a file are skipped, so the cell can be re-run to resume a download.
REQUEST_TIMEOUT = 30.0  # seconds before a stalled request is abandoned and retried
RETRY_DELAY = 1.0       # seconds to wait between two attempts

players_set_pd = pd.read_csv("data/opendota/proplayers/players_list.csv")

# The ids are in the first column; select it by position so the loop does not
# depend on the header the CSV was written with.
for account_id in tqdm(players_set_pd.iloc[:, 0]):
    filename = f"data/opendota/proplayers/players_matches_full/{account_id}.json"

    if os.path.exists(filename):
        continue

    url = query_template.replace("[[[account_id]]]", str(account_id))

    while True:
        try:
            result = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            # Only network-level failures (timeouts included) are retried. A bare
            # `except:` would also swallow KeyboardInterrupt, which makes this
            # loop impossible to stop from the notebook.
            sleep(RETRY_DELAY)
            continue

        if result.status_code == 200:
            # Written outside the `try` so that a disk error surfaces instead
            # of being retried forever.
            with open(filename, "w") as f:
                f.write(result.text)
            break

        # Any other status code is retried after a short pause.
        sleep(RETRY_DELAY)

100%|██████████| 7420/7420 [2:39:18<00:00,  1.29s/it]   


## List unique matches 

In [20]:
# Gather the distinct match ids found at the head of every player's history file.
#
# The decoded file is bound to `history`, not `matches`: rebinding `matches` to
# a list would clobber the pro-match DataFrame that later cells still use via
# `matches.sort_values(...)`.

# Number of leading entries read from each history file.
# NOTE(review): the previous cutoff (`break` once the index reached 100) kept
# 101 entries per file; the value is preserved here — confirm 100 was not intended.
HISTORY_HEAD = 101

files = glob("data/opendota/proplayers/players_matches/*")
unique_matches = set()
for file in tqdm(files):
    with open(file, "r") as f:
        history = json.load(f)

    for position, entry in enumerate(history):
        if position >= HISTORY_HEAD:
            break
        unique_matches.add(entry['match_id'])

print(len(unique_matches))

100%|██████████| 7420/7420 [02:09<00:00, 57.20it/s] 

500789





In [19]:
# Report how many distinct match ids were collected.
print(len(unique_matches))

19383114


## Calculate simple per-player stats

In [3]:
# Build `players`: one row of per-player averages (kills, deaths, assists, win
# rate) for every basic match-history file in players_matches/.
def _summarise_player(account_id, history):
    """Return the average basic stats of one player as a flat record.

    Parameters
    ----------
    account_id : str
        Identifier taken from the history file name.
    history : list of dict
        Decoded history file; every entry is read for 'kills', 'deaths',
        'assists', 'player_slot' and 'radiant_win'.

    Returns
    -------
    dict
        Keys 'account_id', 'kills', 'deaths', 'assists', 'wins' (share of
        matches won) and 'num_matches'. The four averages are NaN when
        `history` is empty, because they are undefined without any match.
    """
    num_matches = len(history)
    if num_matches == 0:
        undefined = float("nan")
        return {
            "account_id": account_id,
            "kills": undefined,
            "deaths": undefined,
            "assists": undefined,
            "wins": undefined,
            "num_matches": 0,
        }

    # A match counts as won when the player's side (slots below 128 are
    # compared against a truthy 'radiant_win') agrees with the recorded outcome.
    wins = sum(
        (entry['player_slot'] < 128) == bool(entry['radiant_win'])
        for entry in history
    )

    return {
        "account_id": account_id,
        "kills": sum(entry['kills'] for entry in history) / num_matches,
        "deaths": sum(entry['deaths'] for entry in history) / num_matches,
        # The assists average used to be computed and then left out of the row.
        "assists": sum(entry['assists'] for entry in history) / num_matches,
        "wins": wins / num_matches,
        "num_matches": num_matches,
    }


files = glob("data/opendota/proplayers/players_matches/*")

# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append copied the whole frame per row and no longer exists in
# pandas 2.x.
player_records = []
for file in tqdm(files):
    # Bound to `history`, not `matches`, so the pro-match DataFrame used by
    # later cells is not overwritten.
    with open(file, "r") as f:
        history = json.load(f)

    # os.path.basename copes with either path separator, unlike split("/").
    account_id = os.path.basename(file).split(".")[0]
    player_records.append(_summarise_player(account_id, history))

players = pd.DataFrame(
    player_records,
    columns=["account_id", "kills", "deaths", "assists", "wins", "num_matches"],
)

100%|██████████| 7420/7420 [05:26<00:00, 22.74it/s]


In [None]:
# Write the per-player table to disk without the DataFrame index.
simplestats_csv = "data/opendota/proplayers/players_simplestats.csv"
players.to_csv(simplestats_csv, index=False)

## Calculate advanced per-player stats

In [3]:
# Build `players`: one row per projected ("full") history file in
# players_matches_full/, holding the account id, the number of matches and the
# mean of every remaining numeric column (plus the derived 'win' rate).

# Columns removed from a history before the per-column mean is taken.
NON_AVERAGED_COLUMNS = [
    "match_id",
    "player_slot",
    "radiant_win",
    "start_time",
    "lobby_type",
    "game_mode",
    "leaver_status",
    "hero_id",
    "version",
    "skill",
    "party_size",
]

files = glob("data/opendota/proplayers/players_matches_full/*")

# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append copied the whole frame per row and no longer exists in
# pandas 2.x.
player_records = []
for file in tqdm(files):
    # The second positional argument of read_json is `orient`, not a file mode,
    # so the former "r" was not a valid value. The history is bound to `history`
    # rather than `matches` so the pro-match DataFrame used by later cells
    # is not overwritten.
    history = pd.read_json(file, orient="records")
    # os.path.basename copes with either path separator, unlike split("/").
    account_id = os.path.basename(file).split(".")[0]
    num_matches = len(history)

    entry = {
        'account_id': account_id,
        'num_matches': num_matches,
    }

    # An empty history has no columns at all, so there is nothing to average;
    # the row then carries only the id and a zero match count.
    if num_matches:
        history['win'] = (history['player_slot'] < 128) == history['radiant_win']
        averaged = history.drop(columns=NON_AVERAGED_COLUMNS)
        # numeric_only=True skips non-numeric columns instead of failing on them.
        entry.update(averaged.mean(numeric_only=True).to_dict())

    player_records.append(entry)

players = pd.DataFrame(player_records)

  4%|▎         | 276/7420 [00:30<13:20,  8.93it/s]


KeyboardInterrupt: 

In [31]:
# Replace every missing value with 0, then write the table to disk without the
# DataFrame index.
players = players.fillna(value=0)
players.to_csv(
    "data/opendota/proplayers/players_advstats.csv",
    index=False,
)

In [9]:
# Read the saved advanced stats back into `players_stats` and show the first row.
advstats_csv = "data/opendota/proplayers/players_advstats.csv"
players_stats = pd.read_csv(advstats_csv)
display(players_stats.head(n=1))

Unnamed: 0,account_id,actions_per_min,assists,courier_kills,deaths,denies,duration,gold_per_min,hero_damage,hero_healing,kda,kills,level,neutral_kills,num_matches,stuns,tower_damage,tower_kills,win,xp_per_min
0,174908375,245.116531,12.814627,0.044655,7.41872,8.527441,2434.257594,466.025029,21142.815934,471.309341,3.029201,9.838222,20.806805,56.136671,7672.0,31.356261,2879.137225,1.694181,0.506778,559.934037


In [17]:
# Generate the advanced dataset: one row per pro match holding the outcome
# ('rad_won') plus the averaged stats of every participant, taken from
# `players_stats` and flattened into "<position>_<stat>" columns.
#
# The former `NUM_MATCHES` / `matches_latest` assignments were never read in
# this cell (and depended on whatever `matches` happened to hold), so they
# have been removed.

# Matches whose id is above this value are left out of the dataset.
# NOTE(review): the origin of this id is not visible here — confirm and document it.
MAX_MATCH_ID = 5237144928


def _team_columns(team_stats, first_position):
    """Flatten the stats of one team into "<position>_<stat>" columns.

    Parameters
    ----------
    team_stats : list of dict
        One stats mapping per player of the team; each must hold 'num_matches'.
    first_position : int
        Position number given to the first player of the team.

    Returns
    -------
    dict
        Every stat of every player keyed by "<position>_<stat>". Players are
        ordered by 'num_matches' from highest to lowest and numbered
        consecutively from `first_position`.
    """
    ordered = sorted(team_stats, key=lambda stats: stats['num_matches'], reverse=True)
    columns = {}
    for position, stats in enumerate(ordered, start=first_position):
        for key, value in stats.items():
            columns[f"{position}_{key}"] = value
    return columns


# Index the stats table by account once, so each player is a direct lookup
# instead of a scan of the whole table. For a repeated id the first row is kept.
stats_by_account = (
    players_stats
    .drop_duplicates('account_id')
    .set_index('account_id')
)

# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append copied the whole frame per row and no longer exists in
# pandas 2.x.
entries = []
skipped_matches = 0

files = glob("data/opendota/proplayers/pro_matches/*")
for file in tqdm(files):
    with open(file, "r") as f:
        match = json.load(f)

    if match['match_id'] > MAX_MATCH_ID:
        continue

    players_rad = []
    players_norad = []
    every_player_known = True
    for player in match['players']:
        account_id = player["account_id"]
        if account_id not in stats_by_account.index:
            # Without a stats row the match cannot be described completely;
            # it is skipped rather than failing the whole run on `.iloc[0]`.
            every_player_known = False
            break

        stats = stats_by_account.loc[account_id].to_dict()
        if player['player_slot'] < 128:
            players_rad.append(stats)
        else:
            players_norad.append(stats)

    if not every_player_known:
        skipped_matches += 1
        continue

    # Positions run over the slot<128 team first, then continue over the other.
    entry = {"rad_won": bool(match['radiant_win'])}
    entry.update(_team_columns(players_rad, 0))
    entry.update(_team_columns(players_norad, len(players_rad)))
    entries.append(entry)

result_df = pd.DataFrame(entries)

if skipped_matches:
    print(f"Skipped {skipped_matches} matches with a player missing from players_stats")

100%|██████████| 10000/10000 [04:27<00:00, 37.33it/s]


In [18]:
# Write the assembled dataset to disk without the DataFrame index.
dataset_adv_csv = "datasets/dataset_adv.csv"
result_df.to_csv(dataset_adv_csv, index=False)

## Generate simple dataset

In [15]:
# Generate the simple dataset: walk the pro matches from the highest match id
# down and, for each one, average every participant's rows from *earlier*
# matches in `players_cleared`. One dataset row is the outcome ('rad_won') plus
# those averages keyed by "<player_slot>_<stat>".
NUM_MATCHES = 10_000          # stop after exporting this many matches
MIN_PLAYERS_WITH_HISTORY = 6  # matches with fewer players having earlier rows are skipped

# Fail before any work starts, with a message that says what is missing.
if "players_cleared" not in globals():
    raise NameError(
        "name 'players_cleared' is not defined: this cell needs a per-player, "
        "per-match table with 'match_id', 'account_id', 'player_slot' and "
        "'hero_id' columns plus the stats to average; build or load it before "
        "running this cell"
    )

# Re-read the pro-match table instead of relying on the global `matches`,
# which other cells rebind to per-player histories.
pro_matches = pd.read_csv("data/opendota/proplayers/promatches.csv")
matches_latest = pro_matches.sort_values(by='match_id', ascending=False)

# Rows with account_id == 0 are excluded from a match's player list.
has_account = players_cleared['account_id'] != 0

# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append copied the whole frame per row and no longer exists in
# pandas 2.x.
entries = []
num_exported = 0

with tqdm(total=NUM_MATCHES) as pbar:
    for match_id, radiant_win in zip(matches_latest['match_id'], matches_latest['radiant_win']):
        players_list = (
            players_cleared[(players_cleared['match_id'] == match_id) & has_account]
            .sort_values('player_slot')
        )
        # Only rows from matches with a smaller id feed the averages.
        hist_players = players_cleared[
            (players_cleared['match_id'] < match_id)
            & (players_cleared['account_id'].isin(players_list['account_id']))
        ]

        stats = (
            hist_players
            .drop(['match_id', 'hero_id', 'player_slot'], axis=1)
            .groupby('account_id')
            .mean()
        )

        if len(stats) < MIN_PLAYERS_WITH_HISTORY:
            continue

        # account_id -> player_slot for this match; the first row wins if an
        # account appears more than once.
        slot_of = (
            players_list
            .drop_duplicates('account_id')
            .set_index('account_id')['player_slot']
        )

        entry = {
            "rad_won": radiant_win
        }

        for acc_id, acc_means in stats.iterrows():
            slot = slot_of.loc[acc_id]
            # Series.iteritems was removed in pandas 2.0; items() is the equivalent.
            for key, val in acc_means.items():
                entry[f"{slot}_{key}"] = val

        entries.append(entry)

        num_exported += 1
        pbar.update(1)

        if num_exported >= NUM_MATCHES:
            break

result_df = pd.DataFrame(entries)

  0%|          | 0/10000 [00:00<?, ?it/s]


NameError: name 'players_cleared' is not defined