In [None]:
# Testing the statistical significance of squad transfer value on match outcomes at Copa America 2024 and Euro 2024

In [178]:
# Load libraries
import numpy as np 
from statsbombpy import sb
from mplsoccer import Pitch, Sbopen
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm

In [144]:
# Look up the competition_id / season_id pairs for the 2024 tournaments
# (the filtered table is displayed so the IDs can be read off for the next cell)
all_competitions = sb.competitions()
is_2024_season = all_competitions['season_name'] == '2024'
competitions = all_competitions[is_2024_season]
competitions

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
21,223,282,South America,Copa America,male,False,True,2024,2024-07-15T18:00:33.653673,,,2024-07-15T18:00:33.653673
68,55,282,Europe,UEFA Euro,male,False,True,2024,2024-07-15T15:48:50.315500,2024-07-15T15:52:24.778809,2024-07-15T15:52:24.778809,2024-07-15T15:48:50.315500


In [146]:
# Download the match list of each tournament (IDs come from the competitions table)
df_euros = sb.matches(competition_id=55, season_id=282)          # UEFA Euro 2024
df_copa_america = sb.matches(competition_id=223, season_id=282)  # Copa America 2024

# Stack both tournaments into a single frame with a fresh 0..n-1 index
tournament_frames = [df_euros, df_copa_america]
df_combined = pd.concat(tournament_frames, ignore_index=True)

# Slim view holding only the columns that identify a fixture
fixture_columns = ['match_id', 'home_team', 'away_team', 'competition_stage']
df = df_combined[fixture_columns]

In [148]:
# Download the event data of every match and derive the list of players who appear in it

# List of match IDs
match_ids = df['match_id'].unique().tolist()

# Initialize the parser
parser = Sbopen()

# One events DataFrame per successfully downloaded match
all_dfs = []

# Loop through the match IDs and fetch the event data
for match_id in match_ids:
    try:
        # parser.event() is unpacked into four frames; only the first (events) is used.
        # It is bound to its own name on purpose: unpacking into `df` would overwrite
        # the match table above, so re-running this cell would read match_ids from
        # the last match's events instead of the full fixture list.
        match_events, _related, _freeze, _tactics = parser.event(match_id)

        # Append the results to the list
        all_dfs.append(match_events)
    except Exception as e:
        # Best effort: report the failing match and carry on with the rest
        print(f"Error retrieving data for match {match_id}: {e}")

# Concatenate all the DataFrames into one final DataFrame
df_matches = pd.concat(all_dfs, ignore_index=True)

# Remove missing names and duplicates to get every unique player name
players = df_matches['player_name'].dropna().unique().tolist()

# Count the number of player names in the list
print(f"Total players found: {len(players)}")


Total players found: 835


In [33]:
# Scrape transfer values from transfermarkt.com
# For each player name the quick-search page is fetched and the text of the first
# <td class="rechts hauptlink"> cell is stored as that player's transfer value.

# Base URL for player search
base_url = "https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query="

# One {"player", "transfer value"} dict per scraped player
results = []


# Iterate through the player list with a 1-based counter for progress output
for i, player in enumerate(players, start=1):
    # Skip anything that is not a name (e.g. NaN). Without the `continue`, the request
    # below would still run with the previous player's URL (or an undefined one on the
    # first iteration) and the result would be recorded against the wrong entry.
    if not isinstance(player, str):
        print(f"Skipping non-string value: {player}")
        continue

    # Format the player name for the URL
    search_url = base_url + player.replace(" ", "+")
    # Print the stage of the scraping process and the player being scraped
    print(f"{i}/{len(players)}: {player}")

    try:
        # Fetch the page; the timeout stops one stalled connection from hanging the whole run
        response = requests.get(search_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract the market value from the first matching table cell
        market_value_tag = soup.find("td", class_="rechts hauptlink")  # Target the specific class
        if market_value_tag:
            market_value = market_value_tag.text.strip()
        else:
            market_value = "Not Found"

        # Append the result
        results.append({"player": player, "transfer value": market_value})
    except Exception as e:
        # Keep the row so the failure stays visible in the output table
        results.append({"player": player, "transfer value": "Error: " + str(e)})

    # Pause to avoid being blocked by the website
    time.sleep(2)

# Convert results to DataFrame
df_player_values = pd.DataFrame(results)

# Display results
print(df_player_values)

Skipping non-string value: nan
2/836: Kobbie Mainoo
3/836: Jordan Pickford
4/836: Memphis Depay
5/836: Virgil van Dijk
6/836: John Stones
7/836: Harry Kane
8/836: Tijjani Reijnders
9/836: Cody Mathès Gakpo
10/836: Kyle Walker
11/836: Nathan Aké
12/836: Stefan de Vrij
13/836: Jerdy Schouten
14/836: Denzel Dumfries
15/836: Donyell Malen
16/836: Marc Guehi
17/836: Declan Rice
18/836: Jude Bellingham
19/836: Kieran Trippier
20/836: Phil Foden
21/836: Bukayo Saka
22/836: Xavi Simons
23/836: Bart Verbruggen
24/836: Joey Veerman
25/836: Luke Shaw
26/836: Wout Weghorst
27/836: Ollie Watkins
28/836: Cole Palmer
29/836: Joshua Zirkzee
30/836: Brian Brobbey
31/836: Conor Gallagher
32/836: Unai Simón Mendibil
33/836: Robin Aime Robert Le Normand
34/836: Daniel Carvajal Ramos
35/836: Daniel Olmo Carvajal
36/836: Álvaro Borja Morata Martín
37/836: Lamine Yamal Nasraoui Ebana
38/836: Rodrigo Hernández Cascante
39/836: Aymeric Laporte
40/836: Nicholas Williams Arthuer
41/836: Marc Cucurella Saseta
42/

In [172]:
# Attach each player's scraped transfer value to their team record
# NOTE(review): df_players is not defined in the cells visible here. Confirm that it is
# built (with a 'player_name' column) before this cell on a fresh kernel.

# Only the join key and the value are needed from the scraped table
scraped_values = df_player_values[['player', 'transfer value']]

# Left join keeps every row of df_players; the duplicate key column 'player'
# brought in from the scraped table is dropped afterwards
df_players_team_value = (
    df_players
    .merge(scraped_values, how='left', left_on='player_name', right_on='player')
    .drop(columns=['player'])
)

# View the result
df_players_team_value.head()

Unnamed: 0,player_name,team_name,transfer value
0,Kobbie Mainoo,England,€55.00m
1,Jordan Pickford,England,€20.00m
2,Memphis Depay,Netherlands,€10.00m
3,Virgil van Dijk,Netherlands,€28.00m
4,John Stones,England,€30.00m


In [176]:
# Function to convert the transfer value to numeric values
def convert_transfer_value(value):
    """Convert a scraped transfer value such as '€55.00m' or '€300k' to a float in euros.

    Args:
        value: The raw cell value. Strings are parsed; numbers are passed through.

    Returns:
        float: The amount in euros, or ``np.nan`` when the value cannot be
        interpreted as a number (e.g. 'Not Found', 'Error: ...', None, NaN).
    """
    # Already numeric (e.g. the cell was run before): pass through so that applying
    # the conversion twice does not turn every value into NaN.
    if isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool):
        return float(value)

    # Anything else that is not text (None, ...) has no usable value
    if not isinstance(value, str):
        return np.nan

    # Remove the euro symbol (€) and surrounding whitespace
    cleaned = value.replace('€', '').strip()

    # Recognise the magnitude only as a trailing suffix. A plain substring test would
    # also match words like "Error: ... transfermarkt.com" and crash in float().
    multiplier = 1
    for suffix, factor in (('bn', 1_000_000_000), ('m', 1_000_000), ('k', 1_000)):
        if cleaned.endswith(suffix):
            multiplier = factor
            cleaned = cleaned[:-len(suffix)].strip()
            break

    try:
        return float(cleaned) * multiplier
    except ValueError:
        # Not a number after all ('Not Found', '-', error messages, ...)
        return np.nan

# Run every entry of the 'transfer value' column through the converter
raw_transfer_values = df_players_team_value['transfer value']
df_players_team_value['transfer value'] = raw_transfer_values.map(convert_transfer_value)

# Show the table with the converted column
print(df_players_team_value)

                             player_name    team_name  transfer value
0                          Kobbie Mainoo      England      55000000.0
1                        Jordan Pickford      England      20000000.0
2                          Memphis Depay  Netherlands      10000000.0
3                        Virgil van Dijk  Netherlands      28000000.0
4                            John Stones      England      30000000.0
..                                   ...          ...             ...
830  Luis Jan Piers Advíncula Castrillón         Peru        300000.0
831       Diego Alfonso Valdés Contreras        Chile       4000000.0
832         Joao Alberto Grimaldo Ubidia         Peru       1000000.0
833          Luis Alfonso Abram Ugarelli         Peru       2000000.0
834                   Jacen Russell-Rowe       Canada       2500000.0

[835 rows x 3 columns]


In [186]:
# Make sure the value column is numeric; anything unparseable becomes NaN
df_players_team_value['transfer value'] = pd.to_numeric(
    df_players_team_value['transfer value'],
    errors='coerce',
)

# Average player value per team (mean() ignores NaN), most valuable squad first
mean_value_by_team = df_players_team_value.groupby('team_name')['transfer value'].mean()
df_team_value = mean_value_by_team.reset_index().sort_values(by='transfer value', ascending=False)

df_team_value

Unnamed: 0,team_name,transfer value
14,England,64761900.0
15,France,52735290.0
5,Brazil,51638890.0
33,Spain,49068180.0
27,Portugal,40761900.0
1,Argentina,33976190.0
17,Germany,33409090.0
22,Netherlands,30775000.0
3,Belgium,28147060.0
19,Italy,28095240.0


In [188]:
# Derive result indicators from the final score of each match
home_goals = df_combined['home_score']
away_goals = df_combined['away_score']

# 1 when the respective side scored more goals, else 0 (a draw gives 0 in both columns)
df_combined['home_win'] = home_goals.gt(away_goals).astype(int)
df_combined['away_win'] = away_goals.gt(home_goals).astype(int)
# Positive when the home side won, negative when the away side won
df_combined['goal_difference'] = home_goals.sub(away_goals).astype(int)

# Keep only the fixture, score and result columns, then preview the first rows
result_columns = [
    'match_id', 'home_team', 'away_team',
    'home_score', 'away_score',
    'home_win', 'away_win', 'goal_difference',
]
df_matches2 = df_combined[result_columns]
df_matches2.head(5)

Unnamed: 0,match_id,home_team,away_team,home_score,away_score,home_win,away_win,goal_difference
0,3942819,Netherlands,England,1,2,0,1,-1
1,3943043,Spain,England,2,1,1,0,1
2,3942752,Spain,France,2,1,1,0,1
3,3942382,Netherlands,Turkey,2,1,1,0,1
4,3942349,Portugal,France,0,0,0,0,0


In [190]:
# Look-up table: mean transfer value per team
team_values = df_team_value[['team_name', 'transfer value']]

# Attach the home side's value (left join keeps every match)
df_home = (
    df_matches2
    .merge(team_values, how='left', left_on='home_team', right_on='team_name')
    .rename(columns={'transfer value': 'transfer value home'})
    .drop(columns='team_name')
)

# Attach the away side's value in the same way
df_final = (
    df_home
    .merge(team_values, how='left', left_on='away_team', right_on='team_name')
    .rename(columns={'transfer value': 'transfer value away'})
    .drop(columns='team_name')
)

# df_final now carries 'transfer value home' and 'transfer value away'
df_final.head()

Unnamed: 0,match_id,home_team,away_team,home_score,away_score,home_win,away_win,goal_difference,transfer value home,transfer value away
0,3942819,Netherlands,England,1,2,0,1,-1,30775000.0,64761900.0
1,3943043,Spain,England,2,1,1,0,1,49068180.0,64761900.0
2,3942752,Spain,France,2,1,1,0,1,49068180.0,52735290.0
3,3942382,Netherlands,Turkey,2,1,1,0,1,30775000.0,13856000.0
4,3942349,Portugal,France,0,0,0,0,0,40761900.0,52735290.0


In [192]:
# Label each match as 'home', 'away' or 'draw' from the final score
home_goals = df_final['home_score']
away_goals = df_final['away_score']
df_final['outcome'] = np.select(
    [home_goals > away_goals, away_goals > home_goals],
    ['home', 'away'],
    default='draw',
)

# Home squad value expressed as a multiple of the away squad value
home_value = df_final['transfer value home']
away_value = df_final['transfer value away']
df_final['home relative transfer value'] = home_value / away_value

# Show the complete table with teams, scores and transfer values
df_final

# Save to csv ahead of statistical testing:
# df_final.to_csv('df_final.csv', index=False)

Unnamed: 0,match_id,home_team,away_team,home_score,away_score,home_win,away_win,goal_difference,transfer value home,transfer value away,outcome,home relative transfer value
0,3942819,Netherlands,England,1,2,0,1,-1,3.077500e+07,6.476190e+07,away,0.475202
1,3943043,Spain,England,2,1,1,0,1,4.906818e+07,6.476190e+07,home,0.757670
2,3942752,Spain,France,2,1,1,0,1,4.906818e+07,5.273529e+07,home,0.930462
3,3942382,Netherlands,Turkey,2,1,1,0,1,3.077500e+07,1.385600e+07,home,2.221059
4,3942349,Portugal,France,0,0,0,0,0,4.076190e+07,5.273529e+07,draw,0.772953
...,...,...,...,...,...,...,...,...,...,...,...,...
78,3939974,United States,Bolivia,2,0,1,0,2,1.642857e+07,6.357143e+05,home,25.842697
79,3939972,Ecuador,Venezuela,1,2,0,1,-1,1.242222e+07,3.427273e+06,away,3.624521
80,3939971,Mexico,Jamaica,1,0,1,0,1,8.821053e+06,1.856944e+06,home,4.750305
81,3939970,Peru,Chile,0,0,0,0,0,1.184783e+06,3.352632e+06,draw,0.353389


In [194]:
# Statistical significance, part 1: one-way ANOVA
# Compares the home relative transfer value across the three outcome groups.

ratio_column = 'home relative transfer value'
df_stage = df_final[df_final['outcome'].isin(['home', 'draw', 'away'])]

# One sample of ratios per outcome
home_values = df_stage.loc[df_stage['outcome'] == 'home', ratio_column]
draw_values = df_stage.loc[df_stage['outcome'] == 'draw', ratio_column]
away_values = df_stage.loc[df_stage['outcome'] == 'away', ratio_column]

# Run the test
f_stat, p_value = stats.f_oneway(home_values, draw_values, away_values)

# Report the statistic
print(f"F-statistic: {f_stat}, P-value: {p_value}")

# A p-value below 0.05 rejects the null hypothesis that all group means are equal

F-statistic: 4.731433428244266, P-value: 0.01142581097495298


In [196]:
# Statistical significance, part 2: independent two-sample t-test
# Home wins vs away wins only; drawn matches are left out.
ratio_column = 'home relative transfer value'
df_win_loss = df_final[df_final['outcome'].isin(['home', 'away'])]

# Ratios of the matches won by the home side and by the away side
home_values = df_win_loss.loc[df_win_loss['outcome'] == 'home', ratio_column]
away_values = df_win_loss.loc[df_win_loss['outcome'] == 'away', ratio_column]

# Compare the two samples
t_stat, p_value = stats.ttest_ind(home_values, away_values)

# Report the statistic
print(f"T-statistic: {t_stat}, P-value: {p_value}")

# A p-value below 0.05 rejects the null hypothesis of equal means

T-statistic: 2.8896015933532277, P-value: 0.005509536644556829


In [200]:
# Logistic regression: models the binary home_win flag (1 = home win, 0 = away win)
# as a function of the home relative transfer value. Draws are excluded.

# Rows with a decisive result
decided = df_final['outcome'].isin(['home', 'away'])

# Predictor (with an added constant column for the intercept) and response
X = sm.add_constant(df_final.loc[decided, ['home relative transfer value']])
y = df_final.loc[decided, 'home_win']

# Fit the model
model = sm.Logit(y, X)
result = model.fit()

# Show the fitted coefficients and diagnostics
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.489442
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:               home_win   No. Observations:                   57
Model:                          Logit   Df Residuals:                       55
Method:                           MLE   Df Model:                            1
Date:                Sat, 05 Apr 2025   Pseudo R-squ.:                  0.2743
Time:                        18:08:16   Log-Likelihood:                -27.898
converged:                       True   LL-Null:                       -38.441
Covariance Type:            nonrobust   LLR p-value:                 4.391e-06
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const                           -0.9725      0.446     -2.180      0.029

In [202]:
# Pearson correlation between the home relative transfer value and the goal difference
relative_value = df_final['home relative transfer value']
goal_diff = df_final['goal_difference']
correlation, p_value = stats.pearsonr(relative_value, goal_diff)

# Report the coefficient and its p-value
print(f"Pearson correlation: {correlation}, p-value: {p_value}")

# A p-value below 0.05 rejects the null hypothesis of no linear correlation

Pearson correlation: 0.4393154609926181, p-value: 3.253260352338878e-05
