### Intro

In [1]:
pwd

'/home/cc/mlops_project/source'

In [2]:
ls

'0. Connecting to VM, creating storage.ipynb'   Untitled.ipynb
'1. Data Preparation.ipynb'                     [0m[01;34mdata[0m/
'1. Extract Live Data (StatsBomb).ipynb'        eval_bash.sh
'2. Player Level Stats.ipynb'                   [01;34mplots[0m/
'3. Visualize Heatmap.ipynb'                    requirements.txt
'4. Player Matching.ipynb'                      [01;31msource.zip[0m


### Import Packages

In [1]:
import pandas as pd
import numpy as np

import json

import matplotlib.pyplot as plt
import seaborn as sns

import sqlite3
from difflib import SequenceMatcher

import re
from unidecode import unidecode

import warnings

warnings.simplefilter(action = 'ignore', category = FutureWarning)
pd.set_option('display.expand_frame_repr', False)

### QA

In [2]:
# Sanity check: how many distinct matches does each competition's event file contain?
EVENTS_DIR   = '/mnt/block/data/raw_data/events'
COMPETITIONS = ['England', 'Spain', 'Italy', 'Germany', 'France', 'European Championship', 'World Cup']


def count_unique_matches(events_path):
    """
    Return the number of distinct `matchId` values found in one event file.

    `events_path` is the path of a JSON file holding a list of event records,
    each of which must carry a `matchId` key (a missing key raises KeyError).
    The parsed events are local to this function, so they can be released as
    soon as the count has been taken.
    """
    with open(events_path, 'r') as f:
        events = json.load(f)
    return len({event['matchId'] for event in events})


for competition in COMPETITIONS:
    # File names use underscores where the printed label uses spaces.
    events_path = f"{EVENTS_DIR}/events_{competition.replace(' ', '_')}.json"
    print(f"{competition}:", count_unique_matches(events_path))

England: 380
Spain: 380
Italy: 380
Germany: 306
France: 380
European Championship: 51
World Cup: 64


### Tags and Events

In [3]:
# Lookup tables stored next to the raw data (`tags` is reused by the event-detail cell).
tags   = pd.read_csv('/mnt/block/data/raw_data/tags2name.csv')
events = pd.read_csv('/mnt/block/data/raw_data/eventid2name.csv')

# Print both tables, separated by an 80-character rule.
print(tags, "-" * 80, events, sep = "\n")

     Tag                Label                  Description
0    101                 Goal                         Goal
1    102             own_goal                     Own goal
2    301               assist                       Assist
3    302              keyPass                     Key pass
4   1901       counter_attack               Counter attack
5    401                 Left                    Left foot
6    402                Right                   Right foot
7    403            head/body                    Head/body
8   1101               direct                       Direct
9   1102             indirect                     Indirect
10  2001  dangerous_ball_lost          Dangerous ball lost
11  2101              blocked                      Blocked
12   801                 high                         High
13   802                  low                          Low
14  1401         interception                 Interception
15  1501            clearance                    Clearan

### Match event detail

In [None]:
# One raw event file per competition. The list order is the order in which the
# files are read and, later, concatenated.
files = [
    f'/mnt/block/data/raw_data/events/events_{competition}.json'
    for competition in (
        'England', 'Italy', 'Germany', 'France', 'Spain',
        'European_Championship', 'World_Cup',
    )
]

# Event fields copied into the detail table; any other key of a raw event is dropped.
columns = (
    "matchId matchPeriod eventSec eventId eventName subEventId subEventName "
    "tags id playerId teamId positions"
).split()

# Seconds added to an event's clock, keyed by match period.
# Periods that are not listed keep their original `eventSec`.
_PERIOD_OFFSET_SECONDS = {
    '2H': 2700,
    'E1': 5400,
    'E2': 6300,
    'P': 7200,
}


def adjust_event_time(row):
    """
    Return the row's `eventSec` shifted by a fixed offset for its match period.

    `row` is anything indexable by 'matchPeriod' and 'eventSec' (a DataFrame
    row when used with `df.apply(..., axis = 1)`, or a plain dict). Rows whose
    period has no entry in the offset table get `eventSec` back unchanged.
    """
    offset  = _PERIOD_OFFSET_SECONDS.get(row['matchPeriod'])
    seconds = row['eventSec']
    if offset is None:
        return seconds
    return seconds + offset

# Map each tag id to its description (from the tags lookup table).
tag_dict = tags.set_index("Tag")["Description"].to_dict()

# Pass 1: load every file once and find the largest number of tags carried by a
# single event, so that all files expand to the same set of tag columns.
global_max_tags  = 0
raw_data_by_file = {}

for file in files:
    with open(file, 'r') as f:
        data = json.load(f)
    raw_data_by_file[file] = data
    # default = 0 keeps an empty event file from raising ValueError in max().
    max_tags        = max((len(event['tags']) for event in data), default = 0)
    global_max_tags = max(global_max_tags, max_tags)

tag_columns = [f'tag_{i + 1}' for i in range(global_max_tags)]

# Pass 2: build one DataFrame per file using the shared tag layout.
all_dfs = []

for file, data in raw_data_by_file.items():
    if not data:
        # Nothing to expand; an empty frame would not even have a 'tags' column.
        continue

    df             = pd.DataFrame([{col: event.get(col, None) for col in columns} for event in data])
    df['eventSec'] = df.apply(adjust_event_time, axis = 1)

    # Reduce every tag record to its id and right-pad with None so each event
    # yields exactly global_max_tags values. Building the frame from plain lists
    # avoids the very slow row-wise `.apply(pd.Series)` expansion.
    tag_ids = [
        [tag['id'] if isinstance(tag, dict) and 'id' in tag else None for tag in event_tags]
        + [None] * (global_max_tags - len(event_tags))
        for event_tags in df['tags']
    ]
    df_tags = pd.DataFrame(tag_ids, columns = tag_columns, index = df.index)

    # Name columns are appended after all id columns: tag_1..tag_N, tag_1_name..tag_N_name.
    for tag_col in tag_columns:
        df_tags[f'{tag_col}_name'] = df_tags[tag_col].map(tag_dict)

    df = df.drop(columns = ['tags']).join(df_tags)
    all_dfs.append(df)

# Concatenate and save
combined_df = pd.concat(all_dfs, ignore_index = True)

# Coerce id-like columns to numeric; the tag columns follow global_max_tags
# instead of a hard-coded tag_1..tag_6 list.
for col in ['eventId', 'subEventId', 'playerId', 'teamId'] + tag_columns:
    if col in combined_df.columns:
        combined_df[col] = pd.to_numeric(combined_df[col], errors = 'coerce')
combined_df.to_parquet("/mnt/block/data/final_datasets/match_detail.parquet", index = False)
print(f"Saved combined events to 'match_detail.parquet' with up to {global_max_tags} tag columns.")

### Match summary

In [100]:
# List of match file paths (the lineup cell further down iterates over this same `files` list)
files = [
    '/mnt/block/data/raw_data/matches/matches_England.json',
    '/mnt/block/data/raw_data/matches/matches_Italy.json',
    '/mnt/block/data/raw_data/matches/matches_Germany.json',
    '/mnt/block/data/raw_data/matches/matches_France.json',
    '/mnt/block/data/raw_data/matches/matches_Spain.json',
    '/mnt/block/data/raw_data/matches/matches_European_Championship.json',
    '/mnt/block/data/raw_data/matches/matches_World_Cup.json'
]


def summarize_match(match):
    """
    Flatten one raw match record into a single summary row.

    Top-level fields are copied with `.get`, so a missing key becomes None.
    The home/away fields are filled from the `teamsData` entries whose `side`
    is 'home' or 'away'; they stay None when `teamsData` is missing, null, or
    has no entry for that side. Team ids are the `teamsData` keys as loaded
    (JSON object keys are strings).
    """
    row = {
        'dateutc': match.get('dateutc'),
        'seasonId': match.get('seasonId'),
        'competitionId': match.get('competitionId'),
        'roundID': match.get('roundId'),
        'gameweek': match.get('gameweek'),
        'duration': match.get('duration'),
        'matchId': match.get('wyId'),
        'result': match.get('label'),
        'status': match.get('status'),
        'venue': match.get('venue'),
        'winner': match.get('winner'),
        'home_team_id': None,
        'home_score': None,
        'home_hasFormation': None,
        'away_team_id': None,
        'away_score': None,
        'away_hasFormation': None
    }

    # `or {}` guards against a record without team data instead of crashing on None.items().
    for team_id, team in (match.get('teamsData') or {}).items():
        side = team.get('side')
        if side in ('home', 'away'):
            row[f'{side}_team_id']      = team_id
            row[f'{side}_score']        = team.get('score')
            row[f'{side}_hasFormation'] = team.get('hasFormation')

    return row


# One summary row per match, across all files
all_rows = []

for filepath in files:
    with open(filepath, 'r') as f:
        data = json.load(f)
    all_rows.extend(summarize_match(match) for match in data)

# Convert to DataFrame
match_summary_df = pd.DataFrame(all_rows)

# Save as Parquet
match_summary_df.to_parquet("/mnt/block/data/final_datasets/match_summary.parquet", index = False)
print("Saved match summaries to 'match_summary.parquet'")

✅ Saved match summaries to 'match_summary.parquet'


### Match lineups

In [108]:
def _player_ids(formation, key):
    """Return the `playerId` of every entry under `formation[key]` (empty list if the key is absent)."""
    return [player['playerId'] for player in formation.get(key, [])]


def _side_lineup(prefix, team_id, team):
    """
    Build the lineup columns for one side of a match.

    `prefix` is 'home' or 'away'; `team` is that side's `teamsData` entry and
    must contain a 'formation' mapping. Substitutions are kept as stored when
    they are a list and replaced by an empty list otherwise.
    """
    formation = team['formation']
    lineup    = _player_ids(formation, 'lineup')
    bench     = _player_ids(formation, 'bench')
    subs      = formation.get('substitutions', [])
    return {
        f'{prefix}_team': team_id,
        f'{prefix}_team_lineup': lineup,
        f'{prefix}_team_bench': bench,
        f'{prefix}_team_subs': subs if isinstance(subs, list) else []
    }


# Initialize an empty list to hold rows of data
rows = []

# Loop through each file.
# NOTE: `files` must hold the *match* JSON paths assigned in the "Match summary" cell;
# the event-detail cell binds the same name to the event files.
for filepath in files:
    with open(filepath, 'r') as f:
        data = json.load(f)

    for match in data:
        match_id = match['wyId']

        # Identify home and away teams
        sides = {}
        for team_id, team_data in match['teamsData'].items():
            side = team_data.get('side')
            if side in ('home', 'away'):
                sides[side] = (team_id, team_data)

        # Proceed only if both sides exist and have formation data
        if 'home' not in sides or 'away' not in sides:
            continue
        if 'formation' not in sides['home'][1] or 'formation' not in sides['away'][1]:
            continue

        try:
            row = {'ID': match_id}
            row.update(_side_lineup('home', *sides['home']))
            row.update(_side_lineup('away', *sides['away']))
        except (AttributeError, KeyError, TypeError) as e:
            # Malformed formation data (non-dict formation, player without a
            # playerId, non-iterable lineup): skip this match, keep the rest.
            print(f"⚠️ Skipping match {match_id} due to error: {e}")
        else:
            rows.append(row)

# Convert to DataFrame and save
df_lineup = pd.DataFrame(rows)
df_lineup.to_parquet("/mnt/block/data/final_datasets/match_lineup.parquet", index = False)
print("Saved match lineups to 'match_lineup.parquet'")
df_lineup.iloc[0:5]

✅ Saved match lineups to 'match_lineup.parquet'


### Competitions

In [4]:
with open('/mnt/block/data/raw_data/competitions.json', 'r') as f:
    data = json.load(f)

# One flat record per competition; the nested `area` is reduced to its name.
# Keys are read with [] on purpose: a competition missing any of them raises KeyError.
competition_records = []
for competition in data:
    competition_records.append({
        'name': competition['name'],
        'Id': competition['wyId'],
        'type': competition['type'],
        'format': competition['format'],
        'area': competition['area']['name']
    })
df = pd.DataFrame(competition_records)

# Show the first rows, then persist the table
print(df.head())
df.to_parquet("/mnt/block/data/final_datasets/competitions.parquet", index = False)

                     name   Id  type           format     area
0  Italian first division  524  club  Domestic league    Italy
1  English first division  364  club  Domestic league  England
2  Spanish first division  795  club  Domestic league    Spain
3   French first division  412  club  Domestic league   France
4   German first division  426  club  Domestic league  Germany


### Coaches

In [117]:
with open('/mnt/block/data/raw_data/coaches.json', 'r') as f:
    data = json.load(f)

# Convert to DataFrame with selected columns
coach_records = []
for coach in data:
    # `or {}` also covers an explicit null passportArea, which `.get(key, {})` would not.
    passport_area = coach.get('passportArea') or {}
    coach_records.append({
        'firstName': coach.get('firstName'),
        'middleName': coach.get('middleName'),
        'lastName': coach.get('lastName'),
        'Id': coach.get('wyId'),
        'birthDate': coach.get('birthDate'),
        'passportArea_id': passport_area.get('id'),
        'passportArea_name': passport_area.get('name'),
        'currentTeamId': coach.get('currentTeamId')
    })
df = pd.DataFrame(coach_records)

df.to_parquet("/mnt/block/data/final_datasets/coaches.parquet", index = False)

# Preview a slice of the table. This must be the last expression of the cell to
# be displayed; placed before the save it was evaluated and silently discarded.
df.iloc[10:50]

### Teams

In [25]:
with open('/mnt/block/data/raw_data/teams.json', 'r') as f:
    data = json.load(f)

# Flatten each team to the selected fields; the nested `area` supplies the country.
team_records = []
for team in data:
    team_records.append({
        'name': team.get('officialName'),
        'id': team.get('wyId'),
        'type': team.get('type'),
        'country_name': team.get('area', {}).get('name'),
        'country_id': team.get('area', {}).get('id'),
        'city': team.get('city')
    })

# country_id is coerced to a nullable integer; unparseable values become <NA>.
df_team = pd.DataFrame(team_records).assign(
    country_id = lambda teams: pd.to_numeric(teams['country_id'], errors = 'coerce').astype('Int64')
)

# Persist, then display the English teams as a spot check
df_team.to_parquet("/mnt/block/data/final_datasets/teams.parquet", index = False)
df_team[df_team['country_name'].eq('England')]

Unnamed: 0,name,id,type,country_name,country_id,city
0,Newcastle United FC,1613,club,England,0,Newcastle upon Tyne
8,Huddersfield Town FC,1673,club,England,0,"Huddersfield, West Yorkshire"
26,AFC Bournemouth,1659,club,England,0,"Bournemouth, Dorset"
27,Brighton & Hove Albion FC,1651,club,England,0,"Brighton, East Sussex"
39,Burnley FC,1646,club,England,0,"Burnley, Lancashire"
54,Leicester City FC,1631,club,England,0,Leicester
55,West Ham United FC,1633,club,England,0,London
56,Stoke City FC,1639,club,England,0,Stoke-on-Trent
60,Watford FC,1644,club,England,0,Watford
65,Everton FC,1623,club,England,0,Liverpool


### Players

#### Functions

In [67]:
# --- Define a function to decode unicode escape sequences ---
def decode_unicode(text):
    """
    Decode literal backslash escape sequences (e.g. the six characters
    ``\\u00eb``) inside a string into the characters they stand for.

    Non-string input is returned unchanged, as is a string without any
    backslash. If the escapes are malformed (for example a trailing
    backslash), the original text is returned.

    Characters that are already non-ASCII are preserved: the text is encoded
    as Latin-1 with `backslashreplace`, so characters outside Latin-1 become
    escapes that the `unicode_escape` decoder turns back into themselves.
    Encoding as UTF-8 instead would make the decoder read every byte as a
    separate Latin-1 character and corrupt such text.
    """
    if not isinstance(text, str):
        return text
    if '\\' not in text:
        return text  # No escape sequences present, nothing to decode
    try:
        return text.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    except UnicodeError:
        return text  # In case decoding fails, return original text

# --- Normalize function ---
def normalize_unicode(text):
    """
    Return a normalized string form of `text`.

    Missing values (anything `pd.isna` reports as missing) become ''.
    Everything else is converted with `str`, lowercased, stripped of
    leading/trailing whitespace and finally passed through `unidecode`.
    """
    if not pd.isna(text):
        cleaned = str(text).lower().strip()
        return unidecode(cleaned)
    return ''

#### Import data

In [73]:
# Year against which later cells compute `age` and the Active/Retired `status`.
reference_year = 2016

with open('/mnt/block/data/raw_data/players.json', 'r') as f:
    data = json.load(f)

# Flatten each raw player to the selected fields
# (raw `birthDate` is stored as `birthDay`, `role.code2` as `position`, `wyId` as `Id`).
player_records = []
for player in data:
    player_records.append({
        'firstName': player.get('firstName'),
        'middleName': player.get('middleName'),
        'lastName': player.get('lastName'),
        'birthDay': player.get('birthDate'),
        'weight': player.get('weight'),
        'height': player.get('height'),
        'position': player.get('role', {}).get('code2'),
        'Id': player.get('wyId'),
        'foot': player.get('foot'),
        'currentTeamId': player.get('currentTeamId'),
        'currentNationalTeamId': player.get('currentNationalTeamId')
    })
df = pd.DataFrame(player_records)

# NOTE(review): this path is relative to the working directory, unlike the
# /mnt/block paths used for every other input — confirm that is intended.
mapping_df    = pd.read_csv("data/player_data/ww.csv")

# Comma-separated .txt files.
# NOTE(review): valuations_df is not referenced by any other cell visible in this notebook.
valuations_df = pd.read_csv("/mnt/block/data/player_data/player_valuations.txt")
players_df    = pd.read_csv("/mnt/block/data/player_data/players.txt")

In [74]:
# Print the column index of every source frame, each prefixed by its label.
for label, frame in (
    ("df", df),
    ("df_mapping", mapping_df),
    ("df_valuations", valuations_df),
    ("players_df", players_df),
):
    print(f"{label}: {frame.columns}")

df: Index(['firstName', 'middleName', 'lastName', 'birthDay', 'weight', 'height',
       'position', 'Id', 'foot', 'currentTeamId', 'currentNationalTeamId'],
      dtype='object')
df_mapping: Index(['PlayerFBref', 'UrlFBref', 'UrlTmarkt', 'TMarketID', 'WyScoutID',
       'StatsBombID', 'FBrefID', 'Position', 'Role', 'TransferMarkt Role'],
      dtype='object')
df_valuations: Index(['player_id', 'last_season', 'datetime', 'date', 'dateweek',
       'market_value_in_eur', 'n', 'current_club_id',
       'player_club_domestic_competition_id'],
      dtype='object')
players_df: Index(['player_id', 'first_name', 'last_name', 'name', 'last_season',
       'current_club_id', 'player_code', 'country_of_birth', 'city_of_birth',
       'country_of_citizenship', 'date_of_birth', 'sub_position', 'position',
       'foot', 'height_in_cm', 'market_value_in_eur',
       'highest_market_value_in_eur', 'contract_expiration_date', 'agent_name',
       'image_url', 'url', 'current_club_domestic_competitio

#### Merge Data

In [75]:
# 1) Attach the cross-provider id mapping, keyed on the player Id.
# NOTE(review): a left merge repeats a player once per matching mapping row, so
# any WyScoutID listed more than once in mapping_df duplicates that player here
# (the duplicated-Id QA cell lists such rows). Which mapping row is the correct
# one has to be decided per player, so nothing is dropped at this point.
df = df.merge(mapping_df, left_on = 'Id', right_on = 'WyScoutID', how = 'left')

# 2) Attach the players table via the mapped TMarketID; clashing column names
#    from players_df get the '_player' suffix.
df = df.merge(players_df, left_on = 'TMarketID', right_on = 'player_id', how = 'left', suffixes = ('', '_player'))

# 3) Club name. Only id + name are taken from df_team and renamed *before* the
#    merge, so the result does not depend on pandas' automatic _x/_y suffixes
#    (players_df also has a `name` column).
club_lookup = df_team[['id', 'name']].rename(columns = {'id': 'team_Id', 'name': 'club'})
df = (
    df.merge(club_lookup, left_on = 'currentTeamId', right_on = 'team_Id', how = 'left')
      .drop(columns = ['team_Id'])
)

# 4) National team name, same approach.
national_lookup = df_team[['id', 'name']].rename(columns = {'id': 'nat_team_Id', 'name': 'national_team'})
df = (
    df.merge(national_lookup, left_on = 'currentNationalTeamId', right_on = 'nat_team_Id', how = 'left')
      .drop(columns = ['nat_team_Id'])
)

# Keep only the columns used downstream, in a fixed order.
column_order = ['Id', 'firstName', 'middleName', 'lastName',
                'birthDay', 'weight', 'height', 'foot',
                'club', 'currentTeamId', 'national_team', 'currentNationalTeamId', 'country_of_birth', 'city_of_birth', 'country_of_citizenship',
                'position', 'Role', 'TransferMarkt Role',
                'last_season', 'market_value_in_eur', 'highest_market_value_in_eur', 'contract_expiration_date', 'agent_name',
                'UrlFBref', 'image_url', 'UrlTmarkt', 'TMarketID', 'WyScoutID', 'StatsBombID', 'FBrefID']
df           = df[column_order]

Index(['firstName', 'middleName', 'lastName', 'birthDay', 'weight', 'height',
       'position', 'Id', 'foot', 'currentTeamId', 'currentNationalTeamId',
       'PlayerFBref', 'UrlFBref', 'UrlTmarkt', 'TMarketID', 'WyScoutID',
       'StatsBombID', 'FBrefID', 'Position', 'Role', 'TransferMarkt Role',
       'player_id', 'first_name', 'last_name', 'name_x', 'last_season',
       'current_club_id', 'player_code', 'country_of_birth', 'city_of_birth',
       'country_of_citizenship', 'date_of_birth', 'sub_position',
       'position_player', 'foot_player', 'height_in_cm', 'market_value_in_eur',
       'highest_market_value_in_eur', 'contract_expiration_date', 'agent_name',
       'image_url', 'url', 'current_club_domestic_competition_id',
       'current_club_name', 'club', 'type_x', 'country_name_x', 'country_id_x',
       'city_x', 'national_team', 'type_y', 'country_name_y', 'country_id_y',
       'city_y'],
      dtype='object')


#### Transformations

In [77]:
# --- Normalize df ---
# Work on an explicit copy: df was produced by a column selection, and the
# assignments below should act on an independent frame.
df = df.copy()

df['firstName']             = df['firstName'].apply(decode_unicode).apply(normalize_unicode)
df['lastName']              = df['lastName'].apply(decode_unicode).apply(normalize_unicode)
df['full_name']             = (df['firstName'] + ' ' + df['lastName']).str.lower()

# normalize_unicode turns a missing value into '', so a trailing fillna() can
# never fire; the empty string is what has to be mapped to 'unknown'.
df['foot']                  = df['foot'].apply(normalize_unicode).str.lower().replace('', 'unknown')

# Height and weight are kept as stripped strings.
df['height']                = df['height'].astype(str).str.strip()
df['weight']                = df['weight'].astype(str).str.strip()

# Parse the birth date once: age comes from the year, the column itself is
# stored as a 'YYYY-MM-DD' string (unparseable dates become missing).
birth_dates                 = pd.to_datetime(df['birthDay'], errors = 'coerce')
df['age']                   = reference_year - birth_dates.dt.year
df['birthDay']              = birth_dates.dt.strftime('%Y-%m-%d')

df['currentTeamId']         = pd.to_numeric(df['currentTeamId'].replace('null', np.nan), errors = 'coerce')
df['club']                  = df['club'].apply(decode_unicode).apply(normalize_unicode)
df['currentNationalTeamId'] = pd.to_numeric(df['currentNationalTeamId'].replace('null', np.nan), errors = 'coerce')
df['national_team']         = df['national_team'].apply(decode_unicode).apply(normalize_unicode)

# NOTE(review): a missing last_season compares False against reference_year, so
# players without that value are labelled 'Active' — confirm this is intended.
df['status']                = np.where(df['last_season'] < reference_year, 'Retired', 'Active')

# Fall back to the coarse position where no detailed Role is available.
df['Role']                  = df['Role'].fillna(df['position'])

In [78]:
# Distinct-value count per column, a 50-character rule, then the frame's shape as cell output.
print(df.nunique(), "-" * 50, sep = "\n")
df.shape

Id                             3603
firstName                      2002
middleName                        1
lastName                       3403
birthDay                       2728
weight                           47
height                           42
foot                              5
club                             99
currentTeamId                   614
national_team                    45
currentNationalTeamId           156
country_of_birth                113
city_of_birth                  1805
country_of_citizenship          114
position                          4
Role                             16
TransferMarkt Role               13
last_season                      11
market_value_in_eur             105
highest_market_value_in_eur     135
contract_expiration_date         27
agent_name                      663
UrlFBref                       3143
image_url                      3061
UrlTmarkt                      3128
TMarketID                      3128
WyScoutID                   

(3618, 33)

In [79]:
# Final column layout of the players table, grouped by theme.
# Columns not listed here are dropped by the selection below.
column_order = [
    # identity
    'Id', 'firstName', 'middleName', 'lastName', 'full_name',
    # physical attributes
    'birthDay', 'age', 'weight', 'height', 'foot',
    # teams and origin
    'club', 'currentTeamId', 'national_team', 'currentNationalTeamId',
    'country_of_birth', 'city_of_birth', 'country_of_citizenship',
    # playing role
    'position', 'Role',
    # status, market value and contract
    'status', 'market_value_in_eur', 'highest_market_value_in_eur', 'contract_expiration_date', 'agent_name',
    # external links and ids
    'UrlFBref', 'image_url', 'UrlTmarkt', 'TMarketID', 'WyScoutID', 'StatsBombID', 'FBrefID',
]

df = df[column_order]

#### Save

In [81]:
# Persist the players table, then preview the first 50 rows using the
# columns at positions 0-3 and 10-21.
df.to_parquet("/mnt/block/data/final_datasets/players.parquet", index = False)
preview_positions = [*range(0, 4), *range(10, 22)]
df.iloc[:50, preview_positions]

Unnamed: 0,Id,firstName,middleName,lastName,club,currentTeamId,national_team,currentNationalTeamId,country_of_birth,city_of_birth,country_of_citizenship,position,Role,status,market_value_in_eur,highest_market_value_in_eur
0,32777,harun,,tekin,,4502.0,turkey,4687.0,,,,GK,GK,Active,,
1,393228,malang,,sarr,o.g.c. nice cote d'azur,3775.0,,4423.0,France,Nice,France,DF,CB,Active,8000000.0,18000000.0
2,393230,over,,mandanda,fc girondins de bordeaux,3772.0,,,France,Évreux,France,GK,GK,Active,150000.0,250000.0
3,32793,alfred john momar,,n'diaye,malaga club de futbol,683.0,senegal,19314.0,France,Paris,Senegal,MD,DM,Active,1200000.0,8000000.0
4,393247,ibrahima,,konate,rasen ballsport leipzig,2975.0,,,France,Paris,France,DF,CB,Active,35000000.0,45000000.0
5,33,jasper,,cillessen,fc barcelona,676.0,,664.0,Netherlands,Nijmegen,Netherlands,GK,GK,Active,3000000.0,20000000.0
6,36,toby,,alderweireld,tottenham hotspur fc,1624.0,belgium,5629.0,Belgium,Wilrijk,Belgium,DF,CB,Active,4000000.0,40000000.0
7,48,jan,,vertonghen,tottenham hotspur fc,1624.0,belgium,5629.0,Belgium,Sint-Niklaas,Belgium,DF,CB,Active,1800000.0,32000000.0
8,229427,alexander,,djiku,stade malherbe caen,3783.0,,,France,Montpellier,Ghana,DF,CB,Active,8000000.0,10000000.0
9,54,christian,,dannemann eriksen,tottenham hotspur fc,1624.0,denmark,7712.0,Denmark,Middelfart,Denmark,MD,AM,Active,25000000.0,100000000.0


#### QA

In [40]:
# Show every row whose player Id occurs more than once, with the id/link columns
# needed to tell the duplicates apart. Columns are selected by name: positional
# indices break whenever the column layout changes (positions 25-33 do not all
# exist in the 31-column layout saved above and raise IndexError).
qa_columns = ['Id', 'firstName', 'middleName', 'lastName',
              'UrlFBref', 'image_url', 'UrlTmarkt', 'TMarketID', 'WyScoutID', 'StatsBombID', 'FBrefID',
              'full_name', 'age']
df.loc[df.duplicated(subset = 'Id', keep = False), qa_columns]

Unnamed: 0,Id,firstName,middleName,lastName,UrlFBref,image_url,UrlTmarkt,TMarketID,WyScoutID,StatsBombID,FBrefID,full_name,age
21,118,memphis,,depay,https://fbref.com/en/players/8f696594/Memphis,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.com/memphis-depay/pr...,167850.0,118.0,2988.0,8f696594,memphis depay,22
22,118,memphis,,depay,https://fbref.com/en/players/8f696594/Memphis-...,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.com/memphis-depay/pr...,167850.0,118.0,2988.0,8f696594,memphis depay,22
445,70136,santiago,,arias naranjo,https://fbref.com/en/players/9d95e065/Jhon-Duq...,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.com/santiago-arias/p...,120443.0,70136.0,5696.0,9d95e065,santiago arias naranjo,24
446,70136,santiago,,arias naranjo,https://fbref.com/en/players/fdd60087/Santiago...,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.com/santiago-arias/p...,120443.0,70136.0,5696.0,fdd60087,santiago arias naranjo,24
700,263490,sehrou,,guirassy,https://fbref.com/en/players/923f4dda/Sehrou-G...,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.com/serhou-guirassy/...,270541.0,263490.0,9102.0,923f4dda,sehrou guirassy,20
701,263490,sehrou,,guirassy,https://fbref.com/en/players/923f4dda/Serhou-G...,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.com/serhou-guirassy/...,270541.0,263490.0,9102.0,923f4dda,sehrou guirassy,20
768,143566,douglas,,dos santos justino de melo,https://fbref.com/en/players/c50e5bba/Douglas-...,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.com/douglas-santos/p...,220793.0,143566.0,,c50e5bba,douglas dos santos justino de melo,22
769,143566,douglas,,dos santos justino de melo,https://fbref.com/en/players/92b4758e/Samir-Sa...,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.com/douglas-santos/p...,220793.0,143566.0,,92b4758e,douglas dos santos justino de melo,22
1246,344140,amath,,diedhiou ndiaye,https://fbref.com/en/players/f3b1d9a2/Amath,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.com/amath-ndiaye/pro...,339820.0,344140.0,6621.0,f3b1d9a2,amath diedhiou ndiaye,20
1247,344140,amath,,diedhiou ndiaye,https://fbref.com/en/players/bcfc4473/Loum-Ndiaye,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.com/amath-ndiaye/pro...,339820.0,344140.0,6621.0,bcfc4473,amath diedhiou ndiaye,20


### Player detail

In [31]:
# Load the JSON data
with open('/mnt/block/data/raw_data/playerank.json', 'r') as f:
    data = json.load(f)

# Convert the data to a DataFrame
df = pd.DataFrame(data)

# Split 'roleCluster' on hyphens into up to three parts. n = 2 leaves anything
# after the second hyphen inside the third part. reindex() guarantees exactly
# three columns even when no value contains a hyphen; without it, assigning the
# split result to a fixed number of columns raises.
role_parts = df['roleCluster'].str.split('-', n = 2, expand = True).reindex(columns = range(3))

df['primaryRole']   = role_parts[0]
# Rows without a secondary / third role get an empty string instead of a missing value
df['secondaryRole'] = role_parts[1].fillna('')
df['thirdRole']     = role_parts[2].fillna('')

df.iloc[0:10]

Unnamed: 0,goalScored,playerankScore,matchId,playerId,roleCluster,minutesPlayed,primaryRole,secondaryRole,thirdRole
0,0,0.0053,2057991,10014,right CB,90,right CB,,
1,0,0.0009,2057992,10014,right CB,41,right CB,,
2,0,-0.0013,2057998,100140,central MF,90,central MF,,
3,0,0.0031,2058000,100140,left CB,90,left CB,,
4,0,0.0035,2499869,10108,right CB,90,right CB,,
5,0,0.0071,2499879,10108,right CB,90,right CB,,
6,0,0.0109,2499901,10108,right CB,90,right CB,,
7,0,0.0224,2499914,10108,right CB,90,right CB,,
8,0,0.0082,2499922,10108,right CB,90,right CB,,
9,0,0.0177,2499964,10108,right CB,90,right CB,,


In [38]:
# Number of distinct player ids in the player-detail frame
unique_player_ids = df['playerId'].unique()
print(len(unique_player_ids))

2719
