In [98]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [99]:
from gql import Client, gql
from gql.transport.httpx import HTTPXTransport
import os
from tqdm.auto import tqdm
import pandas as pd

In [100]:
GQL_CLIENT_TIMEOUT = 60
RAW_DATA_PATH = '../data/raw/PL-22-23'


In [101]:
from gandula.providers.pff.api.api_client import execute_query, get_client

api_url = os.getenv('api_url')
api_key = os.getenv('api_key')
client = get_client(api_url, api_key)

# Queries

In [102]:
get_players = """
query rosters($gameId: ID!)  {
  rosters(gameId: $gameId) {
    positionGroupType
    player {
      id
      firstName
      lastName
      nickname
    }
    shirtNumber
    team {
      id
    }
    game {
      id
    }
  }
}
"""

In [103]:
get_game = """
query game($id: ID!) {
  game(id: $id) {
     homeTeam {
      id
      name
    }
    awayTeam {
      id
      name
    }
    competition {
      id
      name
    }
    date
    homeTeamStartLeft
    id
    season
    week
  }
}
"""

In [104]:
get_teams = """
query game($id: ID!) {
  game(id: $id) {
    id
    homeTeam  {
      id
      name
      shortName
      country
      kits {
        name 
        primaryColor
        primaryTextColor
        secondaryColor
        secondaryTextColor
        updatedAt 
      }
    }
  }
}

"""

In [105]:
get_game_players = """
query game($id: ID!) {
  game(id: $id) {
    id
    rosters {
      positionGroupType
      player {
        id
        firstName
        lastName
        nickname
      }
      shirtNumber
      team {
        id
      }
    }
  }
}
"""

In [106]:
games = os.listdir(RAW_DATA_PATH)
game_ids = [game.split('.')[0] for game in games]
print(f"Games to process: {len(game_ids)}")

Games to process: 190


In [107]:
import time

games = []
teams = []
team_ids = []
players = []
player_ids = []

for game_id in tqdm(game_ids, desc='Processing games', total=len(game_ids)):
    variables = {'id': int(game_id)}
    player_variables = {'gameId': int(game_id)}

    # game_result = client.execute(
    #         gql(get_game),
    #         variable_values=variables,
    #         serialize_variables=True,
    #         parse_result=True,
    #     )
    
    # games.append(game_result['game'])

    # teams_result = client.execute(
    #         gql(get_teams),
    #         variable_values=variables,
    #         serialize_variables=True,
    #         parse_result=True,
    #     )
    
    # if teams_result['game']['homeTeam']['id'] not in team_ids:
    #     teams.append(teams_result['game'])
    #     team_ids.append(teams_result['game']['homeTeam']['id'])

    players_result = client.execute(
            gql(get_game_players),
            variable_values=variables,
            serialize_variables=True,
            parse_result=True,
        )
    
    players.append(players_result['game'])

    time.sleep(1)

Processing games:   0%|          | 0/190 [00:00<?, ?it/s]

In [108]:
def games_to_df(games):
    
    games_list = []

    for game in games:

        games_list.append({
            'match_id': int(game['id']),
            'season': game['season'],
            'week': int(game['week']),
            'date': pd.to_datetime(game['date']),
            'home_team_id': int(game['homeTeam']['id']),
            'home_team_name': game['homeTeam']['name'],
            'away_team_id': int(game['awayTeam']['id']),
            'away_team_name': game['awayTeam']['name'],
            'competition_id': int(game['competition']['id']),
            'competition_name': game['competition']['name'],
            'home_team_start_left': game['homeTeamStartLeft']
    
        })

    return pd.DataFrame(games_list).sort_values(by='match_id').reset_index(drop=True)

In [109]:
def teams_to_df(teams):
    
    teams_list = []

    for team in teams:

        kits = team['homeTeam']['kits']  # lista de dicionários

        # 1) Filtrar apenas kits com name=='home'
        home_kits = [k for k in kits if k['name'].lower() == 'home']
        
        if home_kits:
            # Ordenar pelos updatedAt (ISO 8601) em ordem decrescente
            home_kits.sort(key=lambda x: x['updatedAt'], reverse=True)
            chosen_kit = home_kits[0]  # mais recente
        else:
            # 2) Se não existir kit 'home', usar o kit mais recente entre todos
            kits.sort(key=lambda x: x['updatedAt'], reverse=True)
            chosen_kit = kits[0]  # mais recente no geral

        primary_color = chosen_kit['primaryColor']
        secondary_color = chosen_kit['secondaryColor']

        teams_list.append({
            'team_id': int(team['homeTeam']['id']),
            'team_name': team['homeTeam']['name'],
            'team_short_name': team['homeTeam']['shortName'],
            'team_country': team['homeTeam']['country'],
            'primary_color': primary_color,
            'secondary_color': secondary_color
        })

    return pd.DataFrame(teams_list).drop_duplicates(subset=['team_id']).sort_values('team_id').reset_index(drop=True)

In [110]:
def players_to_df(players_game_rosters):
    
    players_list = []
    print(players_game_rosters)



    for game_roster in players_game_rosters:
        match_id = game_roster['id']
        players_rosters = game_roster['rosters']

        for player in players_rosters:

            players_list.append({
                'player_id': int(player['player']['id']),
                'first_name': player['player']['firstName'],
                'last_name': player['player']['lastName'],
                'nickname': player['player']['nickname'],
                'team_id': int(player['team']['id']),
                'position': player['positionGroupType'],
                'shirt_number': player['shirtNumber'],
                'match_id': int(match_id)
            })

    return pd.DataFrame(players_list).drop_duplicates(subset=['player_id','team_id','match_id']).sort_values('player_id').reset_index(drop=True)

In [111]:
# games_df = games_to_df(games)
# games_df 

In [112]:
# teams_df = teams_to_df(teams)
# teams_df 

KeyError: 'team_id'

In [113]:
players_df = players_to_df(players)
players_df 

[{'id': '4436', 'rosters': [{'player': {'firstName': 'Luka', 'id': '473', 'lastName': 'Milivojevic', 'nickname': 'Luka Milivojevic'}, 'positionGroupType': 'DM', 'shirtNumber': '4', 'team': {'id': '7'}}, {'player': {'firstName': 'Oleksandr', 'id': '15', 'lastName': 'Zinchenko', 'nickname': 'Oleksandr Zinchenko'}, 'positionGroupType': 'LB', 'shirtNumber': '35', 'team': {'id': '2'}}, {'player': {'firstName': 'Eberechi', 'id': '1940', 'lastName': 'Eze', 'nickname': 'Eberechi Eze'}, 'positionGroupType': 'AM', 'shirtNumber': '10', 'team': {'id': '7'}}, {'player': {'firstName': 'Reiss', 'id': '165', 'lastName': 'Nelson', 'nickname': 'Reiss Nelson'}, 'positionGroupType': 'RW', 'shirtNumber': '24', 'team': {'id': '2'}}, {'player': {'firstName': 'Vicente', 'id': '461', 'lastName': 'Guaita', 'nickname': 'Vicente Guaita'}, 'positionGroupType': 'GK', 'shirtNumber': '13', 'team': {'id': '7'}}, {'player': {'firstName': 'Jeffrey', 'id': '477', 'lastName': 'Schlupp', 'nickname': 'Jeffrey Schlupp'}, 'po

Unnamed: 0,player_id,first_name,last_name,nickname,team_id,position,shirt_number,match_id
0,1,Harry,Kane,Harry Kane,17,CF,10,4481
1,1,Harry,Kane,Harry Kane,17,CF,10,4593
2,1,Harry,Kane,Harry Kane,17,CF,10,4544
3,1,Harry,Kane,Harry Kane,17,CF,10,4613
4,1,Harry,Kane,Harry Kane,17,CF,10,4503
...,...,...,...,...,...,...,...,...
7590,15630,Paul,Onuachu,Paul Onuachu,16,CF,12,4504
7591,15632,Victor,Kristiansen,Victor Kristiansen,9,LB,16,4501
7592,16785,João,Gomes,João Gomes,20,CM,35,4502
7593,16786,Yasin,Ayari,Yasin Ayari,4,CM,26,4496


In [114]:
players_df[players_df['match_id'] == 4625]

Unnamed: 0,player_id,first_name,last_name,nickname,team_id,position,shirt_number,match_id
609,72,Serge,Aurier,Serge Aurier,221,RB,24,4625
615,73,Kyle,Walker-Peters,Kyle Walker-Peters,16,RWB,2,4625
723,89,Willy,Caballero,Willy Caballero,16,GK,13,4625
1204,154,Ainsley,Maitland-Niles,Ainsley Maitland-Niles,16,RM,3,4625
1782,230,Willy,Boly,Willy Boly,221,LCB,30,4625
1942,243,Morgan,Gibbs-White,Morgan Gibbs-White,221,AM,10,4625
2283,285,Steve,Cook,Steve Cook,221,RCB,3,4625
2479,314,Sam,Surridge,Sam Surridge,221,CF,16,4625
2575,334,Jack,Colback,Jack Colback,221,DM,8,4625
2923,394,James,Ward-Prowse,James Ward-Prowse,16,CM,8,4625


# Export CSV

In [115]:
#games_df.to_csv('../data/csv/games.csv', index=False)
#teams_df.to_csv('../data/csv/teams.csv', index=False)
players_df.to_csv('../data/csv/players_matches.csv', index=False)