# 数据下载创建

In [8]:
import requests
import json
import gzip
import shutil
import time
import os
from io import BytesIO

In [9]:
import requests
import json
import gzip
import shutil
import time
import os
from io import BytesIO

# Base URL of the S3 bucket. download_gzip_and_write_to_json appends
# "/<file_name>.json.gz" to it to build each download URL.
S3_BUCKET_URL = "https://vcthackathon-data.s3.us-west-2.amazonaws.com"

# League name; known values: game-changers, vct-international, vct-challengers.
# NOTE(review): the download functions below take a parameter that is also
# called LEAGUE and shadows this global, so this value is not read by them.
LEAGUE = "game-changers"

# Season year; known values: 2022, 2023, 2024.
# NOTE(review): shadowed by the YEAR parameter of the download functions in
# the same way as LEAGUE above.
YEAR = 2022

def download_gzip_and_write_to_json(file_name, timeout=(10, 120)):
    """Download ``<file_name>.json.gz`` from the bucket and unpack it locally.

    The decompressed payload is written to ``<file_name>.json``. When that
    file already exists nothing is downloaded, so an interrupted batch can
    simply be re-run.

    Args:
        file_name: Target path without the ``.json`` suffix. It is used both
            as the key below ``S3_BUCKET_URL`` and as the local output path.
        timeout: ``(connect, read)`` timeout in seconds handed to
            ``requests.get``.

    Returns:
        True when the file was downloaded and written. False when it already
        existed locally, the server answered 404, or the download failed.
    """
    local_path = f"{file_name}.json"
    if os.path.isfile(local_path):
        return False

    remote_file = f"{S3_BUCKET_URL}/{file_name}.json.gz"
    try:
        # The whole body is needed in memory for BytesIO anyway, so the
        # request is not streamed; that also lets a mid-transfer network
        # error surface here instead of during decompression.
        response = requests.get(remote_file, timeout=timeout)
    except requests.RequestException as error:
        print(error)
        print(f"Failed to download {file_name}")
        return False

    if response.status_code == 404:
        # A missing remote file is treated as a normal skip, not an error.
        return False
    if response.status_code != 200:
        print(response)
        print(f"Failed to download {file_name}")
        return False

    # Decompress into a side file and rename at the end: a crash or corrupt
    # archive must not leave a truncated "<file_name>.json" behind, because
    # the isfile() guard above would then skip it on every later run.
    partial_path = f"{local_path}.part"
    try:
        with gzip.GzipFile(fileobj=BytesIO(response.content), mode="rb") as gzipped_file:
            with open(partial_path, "wb") as output_file:
                shutil.copyfileobj(gzipped_file, output_file)
        os.replace(partial_path, local_path)
    finally:
        # Only still present when decompression or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"{file_name}.json written")
    return True


def download_esports_files(LEAGUE, YEAR):
    """Download the league-level metadata files for one league.

    Fetches ``leagues``, ``tournaments``, ``players``, ``teams`` and
    ``mapping_data`` into ``<LEAGUE>/esports-data/``, creating that directory
    when it is missing. Each file is fetched through
    ``download_gzip_and_write_to_json``.

    Args:
        LEAGUE: League name; used as the first path component of both the
            remote key and the local directory.
        YEAR: Accepted but not used by this function.
    """
    directory = f"{LEAGUE}/esports-data"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    for file_name in ("leagues", "tournaments", "players", "teams", "mapping_data"):
        download_gzip_and_write_to_json(f"{directory}/{file_name}")


def download_games(LEAGUE, YEAR):
    """Download the game files listed in a league's mapping data.

    Reads ``<LEAGUE>/esports-data/mapping_data.json`` and, for every entry,
    requests ``<LEAGUE>/games/<YEAR>/<platformGameId>`` through
    ``download_gzip_and_write_to_json``. A progress line is printed after
    every tenth file that was actually written.

    Args:
        LEAGUE: League name; first path component of all paths used here.
        YEAR: Season year; selects the ``games/<YEAR>`` sub-directory.

    Raises:
        FileNotFoundError: If the league's mapping_data.json is not present
            locally.
        KeyError: If a mapping entry has no ``platformGameId``.
    """
    start_time = time.time()

    local_mapping_file = f"{LEAGUE}/esports-data/mapping_data.json"
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which is not UTF-8 on every system.
    with open(local_mapping_file, "r", encoding="utf-8") as json_file:
        mappings_data = json.load(json_file)

    local_directory = f"{LEAGUE}/games/{YEAR}"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(local_directory, exist_ok=True)

    game_counter = 0

    for esports_game in mappings_data:
        s3_game_file = f"{local_directory}/{esports_game['platformGameId']}"

        # Only count files that were newly written; skips and failures
        # come back as False.
        if download_gzip_and_write_to_json(s3_game_file):
            game_counter += 1
            if game_counter % 10 == 0:
                elapsed_minutes = round((time.time() - start_time) / 60, 2)
                print(f"----- 已处理 {game_counter} 场比赛，当前运行时间: {elapsed_minutes} 分钟")


if __name__ == "__main__":
    leagues = ["game-changers", "vct-international", "vct-challengers"]
    years = [2022, 2023, 2024]

    # League/year pairs in the same order a nested league -> year loop visits.
    combos = [(league, year) for league in leagues for year in years]

    for league, year in combos:
        print(f"正在下载 {league} {year} 的数据...")
        # Fetch the base metadata (players, teams, ...) first.
        download_esports_files(league, year)
        # download_games(league, year)

    print("所有数据下载完成")

正在下载 game-changers 2022 的数据...
正在下载 game-changers 2023 的数据...
正在下载 game-changers 2024 的数据...
正在下载 vct-international 2022 的数据...
正在下载 vct-international 2023 的数据...
正在下载 vct-international 2024 的数据...
正在下载 vct-challengers 2022 的数据...
正在下载 vct-challengers 2023 的数据...
正在下载 vct-challengers 2024 的数据...
所有数据下载完成


In [10]:
import gzip
import json

def extract_players(file_path):
    """Load a players JSON file and return one trimmed record per entry.

    Args:
        file_path: Path to a UTF-8 JSON file holding a list of player
            objects with ``id``, ``handle``, ``first_name`` and
            ``last_name`` keys, and optionally ``home_team_id``.

    Returns:
        A list of dicts with the keys ``id``, ``handle``, ``name``
        (first and last name joined by one space) and ``homeTeam``
        (``None`` when the entry has no ``home_team_id``).
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_players = json.load(source)

    return [
        {
            'id': entry['id'],
            'handle': entry['handle'],
            'name': f"{entry['first_name']} {entry['last_name']}",
            'homeTeam': entry.get('home_team_id'),
        }
        for entry in raw_players
    ]

# Usage
# file_path = 'game-changers/esports-data/players.json'
# all_players = extract_players(file_path)
# print(f"Number of players: {len(all_players)}")

# # Print or process the players as needed
# for player in all_players:
#     print(player)


# Per-league player files, in the order: game-changers, vct-challengers,
# vct-international.
file_paths = [
    "game-changers/esports-data/players.json",
    "vct-challengers/esports-data/players.json",
    "vct-international/esports-data/players.json",
]

# Bind the roster names up front so they exist before any file is read.
gameChangersPlayers, vctChallengersPlayers, vctInternationalPlayers = [], [], []

gameChangersPlayers = extract_players(file_paths[0])
print(f"Game Changers 选手数量: {len(gameChangersPlayers)}")

vctChallengersPlayers = extract_players(file_paths[1])
print(f"VCT Challengers 选手数量: {len(vctChallengersPlayers)}")

vctInternationalPlayers = extract_players(file_paths[2])
print(f"VCT International 选手数量: {len(vctInternationalPlayers)}")

# Total number of player records across the three leagues.
total_players = sum(
    len(roster)
    for roster in (gameChangersPlayers, vctChallengersPlayers, vctInternationalPlayers)
)
print(f"所有联赛的选手总数: {total_players}")

# Print every Game Changers record.
for player in gameChangersPlayers:
    print(player)


Game Changers 选手数量: 2999
VCT Challengers 选手数量: 7089
VCT International 选手数量: 3254
所有联赛的选手总数: 13342
{'id': '107025876564296044', 'handle': 'Toma', 'name': 'Tommi Lehtinen', 'homeTeam': '106652238530500027'}
{'id': '106977714184829995', 'handle': 'Stefanie', 'name': 'Stefanie Jones', 'homeTeam': '106976771652907805'}
{'id': '107176790303905255', 'handle': 'aRth', 'name': 'Arthur Hardman', 'homeTeam': '107174817570297246'}
{'id': '107282855606202828', 'handle': 'godwana', 'name': 'Engin  Balcı', 'homeTeam': '107021298845350518'}
{'id': '106977742188901804', 'handle': 'Jaxsen', 'name': 'Jackson Popelka', 'homeTeam': '106976774374552352'}
{'id': '106982393485522814', 'handle': 'div', 'name': 'Diana Ordaz', 'homeTeam': '106977314170008685'}
{'id': '106978667986810556', 'handle': 'Enbyus', 'name': 'Pejmon Shariat', 'homeTeam': '106977291531224391'}
{'id': '107604724069883846', 'handle': 'VERYFREAK', 'name': 'Guiliano Fisogni', 'homeTeam': '107605732546045047'}
{'id': '106732694881302524', 'han

In [11]:
# Concatenate the three league rosters and persist them as one JSON file.
all_players = [*gameChangersPlayers, *vctChallengersPlayers, *vctInternationalPlayers]
print(f"所有选手数量: {len(all_players)}")

file_path = 'all_players.json'
with open(file_path, 'w', encoding='utf-8') as file:
    # ensure_ascii=False keeps non-ASCII names readable in the output file.
    file.write(json.dumps(all_players, ensure_ascii=False, indent=4))

print(f"所有选手已写入 {file_path}")

所有选手数量: 13342
所有选手已写入 all_players.json


# 数据加工，也就是添加选手的property，比如每场比赛选择英雄
## todo
- 是不是有重复的，比如一个选手参加了两个不同的联赛
- 添加选手的property，比如每场比赛选择英雄 → 常用英雄

## Auxiliary functions

In [12]:
def load_players(file_path='all_players.json'):
    """Return the parsed contents of a UTF-8 JSON players file.

    Args:
        file_path: Path of the JSON file to read; defaults to
            ``all_players.json`` in the working directory.

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    with open(file_path, encoding='utf-8') as source:
        players = json.load(source)
    return players

def add_player_property(players, property_name, calculate_property):
    """Set a computed property on every player record, in place.

    Args:
        players: Iterable of mutable player mappings.
        property_name: Key under which the computed value is stored.
        calculate_property: Callable that receives one player record and
            returns the value to store.

    Returns:
        The same ``players`` object that was passed in.
    """
    for entry in players:
        value = calculate_property(entry)
        entry[property_name] = value
    return players

def save_players(players, file_path='all_players.json'):
    """Write the player records to a UTF-8 JSON file and report the path.

    Args:
        players: JSON-serialisable player data.
        file_path: Destination file; defaults to ``all_players.json`` in
            the working directory. An existing file is overwritten.
    """
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    serialised = json.dumps(players, ensure_ascii=False, indent=4)
    with open(file_path, 'w', encoding='utf-8') as target:
        target.write(serialised)
    print(f"所有选手数据已更新并保存到 {file_path}")

## Modify this: implement the per-player property calculators (currently stubs)

In [13]:
def calculate_favorite_heroes(player):
    """Example property calculator: a player's most-used heroes.

    Currently a stub that ignores ``player`` and returns a new empty list.
    """
    # TODO: implement the actual calculation.
    favorites = []
    return favorites

def main():
    """Load all players, attach ``favorite_heroes`` to each, and save them.

    Uses the default file path of ``load_players`` and ``save_players``.
    """
    players = load_players()
    enriched = add_player_property(players, 'favorite_heroes', calculate_favorite_heroes)
    save_players(enriched)


if __name__ == "__main__":
    main()

所有选手数据已更新并保存到 all_players.json
