**Set up modules/data**

In [52]:
import numpy as np
import pandas as pd
import math
pd.options.mode.copy_on_write = True
import random
from matplotlib.pyplot import subplots
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize)

Load data set

In [53]:
UCL_key = pd.read_csv('../Data/key_stats.csv')

UCL_key

Unnamed: 0,player_name,club,position,minutes_played,match_played,goals,assists,distance_covered
0,Courtois,Real Madrid,Goalkeeper,1230,13,0,0,64.2
1,Vinícius Júnior,Real Madrid,Forward,1199,13,4,6,133.0
2,Benzema,Real Madrid,Forward,1106,12,15,1,121.5
3,Modrić,Real Madrid,Midfielder,1077,13,0,4,124.5
4,Éder Militão,Real Madrid,Defender,1076,12,0,0,110.4
...,...,...,...,...,...,...,...,...
742,Gil Dias,Benfica,Midfielder,1,1,0,0,0.7
743,Rodrigo Ribeiro,Sporting CP,Forward,1,1,0,0,0.7
744,Cojocari,Sheriff,Defender,1,1,0,0,0.5
745,Maouassa,Club Brugge,Defender,1,1,0,0,0.2


In [54]:
import pandas as pd
from functools import reduce

# --- Load all the CSVs ---
key_stats    = pd.read_csv('../Data/key_stats.csv')      # base
disciplinary = pd.read_csv('../Data/disciplinary.csv')
attacking    = pd.read_csv('../Data/attacking.csv')
attempts     = pd.read_csv('../Data/attempts.csv')
defending    = pd.read_csv('../Data/defending.csv')
distribution = pd.read_csv('../Data/distributon.csv')
goals        = pd.read_csv('../Data/goals.csv')          # skip goalkeeping.csv

# --- Helper: add a prefix to ALL non-key columns so names are unique ---
def prefix_cols(df, prefix):
    return df.rename(columns={
        col: f'{prefix}_{col}' for col in df.columns if col != 'player_name'
    })

# Give each dataframe its own column namespace
key_stats_p    = prefix_cols(key_stats,    'ks')
disciplinary_p = prefix_cols(disciplinary, 'disc')
attacking_p    = prefix_cols(attacking,    'att')
attempts_p     = prefix_cols(attempts,     'attmpt')
defending_p    = prefix_cols(defending,    'def')
distribution_p = prefix_cols(distribution, 'dist')
goals_p        = prefix_cols(goals,        'goal')

# --- Put them into a list ---
datasets = [
    key_stats_p,
    disciplinary_p,
    attacking_p,
    attempts_p,
    defending_p,
    distribution_p,
    goals_p,
]

# --- Merge everything on player_name using outer join ---
mega_UCL = reduce(
    lambda left, right: pd.merge(left, right, on='player_name', how='outer'),
    datasets
)

# Optional: fill missing values
mega_UCL = mega_UCL.fillna(0)
mega_UCL

Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
0,Aaronson,Salzburg,Midfielder,715.0,8.0,0.0,2.0,103.5,326.0,Salzburg,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Abubakari,Malmö,Forward,116.0,4.0,0.0,0.0,15.3,146.0,Malmö,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Acuña,Sevilla,Defender,379.0,5.0,0.0,0.0,43.2,31.0,Sevilla,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Adams,Leipzig,Midfielder,292.0,5.0,0.0,0.0,38.9,146.0,Leipzig,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Adamu,Salzburg,Forward,231.0,8.0,1.0,0.0,32.6,55.0,Salzburg,...,Forward,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,Óscar Rodríguez,Sevilla,Midfielder,22.0,1.0,0.0,0.0,3.8,0.0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
999,Čolak,Malmö,Forward,500.0,6.0,0.0,0.0,63,40.0,Malmö,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000,Šeško,Salzburg,Forward,234.0,6.0,0.0,0.0,32.1,263.0,Salzburg,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001,Šimić,Salzburg,Forward,4.0,1.0,0.0,0.0,1.3,0.0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Logistic Regression:**

In [55]:
mega_UCL_nodup = mega_UCL.drop_duplicates(subset=['player_name', 'ks_club']) #gets rid of dupelicate rows by looking at name and club

mega_UCL_nodup


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
0,Aaronson,Salzburg,Midfielder,715.0,8.0,0.0,2.0,103.5,326.0,Salzburg,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Abubakari,Malmö,Forward,116.0,4.0,0.0,0.0,15.3,146.0,Malmö,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Acuña,Sevilla,Defender,379.0,5.0,0.0,0.0,43.2,31.0,Sevilla,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Adams,Leipzig,Midfielder,292.0,5.0,0.0,0.0,38.9,146.0,Leipzig,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Adamu,Salzburg,Forward,231.0,8.0,1.0,0.0,32.6,55.0,Salzburg,...,Forward,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,Óscar Rodríguez,Sevilla,Midfielder,22.0,1.0,0.0,0.0,3.8,0.0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
999,Čolak,Malmö,Forward,500.0,6.0,0.0,0.0,63,40.0,Malmö,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000,Šeško,Salzburg,Forward,234.0,6.0,0.0,0.0,32.1,263.0,Salzburg,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001,Šimić,Salzburg,Forward,4.0,1.0,0.0,0.0,1.3,0.0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
mega_UCL_nodup[mega_UCL_nodup['player_name'].duplicated()]['player_name'].unique() #shows players with same last name

array(['Camara', 'Correa', 'Danilo', 'Diallo', 'Fernando', 'Henderson',
       'Herrera', 'João Mário', 'Karavaev', 'Martínez', 'Mendy', 'Onana',
       'Peña', 'Sarr', 'Steffen'], dtype=object)

In [57]:
Dupes = mega_UCL_nodup[mega_UCL_nodup['player_name'].duplicated()]['player_name'].unique()  #shows rows of players with same last name

for name in Dupes:
    print("\n=== Rows for:", name, "===")
    display(mega_UCL_nodup[mega_UCL_nodup['player_name'] == name])


=== Rows for: Camara ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
95,Camara,Salzburg,Midfielder,672.0,8.0,0.0,0.0,80.9,20.0,Salzburg,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111,Camara,Young Boys,Defender,347.0,4.0,0.0,0.0,30.5,20.0,Salzburg,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



=== Rows for: Correa ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
156,Correa,Atlético,Forward,401.0,10.0,1.0,0.0,49.6,55.0,Atlético,...,Forward,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0
164,Correa,Inter,Forward,106.0,5.0,0.0,0.0,14.8,55.0,Atlético,...,Forward,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0



=== Rows for: Danilo ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
182,Danilo,Juventus,Defender,450.0,5.0,0.0,1.0,49.9,146.0,Paris,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,Danilo,Paris,Midfielder,362.0,7.0,0.0,0.0,40.6,146.0,Paris,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



=== Rows for: Diallo ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
239,Diallo,Paris,Defender,165.0,2.0,0.0,0.0,17.9,263.0,Paris,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243,Diallo,Man. United,Midfielder,68.0,1.0,0.0,0.0,8.3,263.0,Paris,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



=== Rows for: Fernando ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
293,Fernando,Sevilla,Midfielder,540.0,6.0,0.0,0.0,66.6,146.0,Sevilla,...,Midfielder,2.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,5.0
301,Fernando,Shakhtar Donetsk,Midfielder,350.0,5.0,2.0,0.0,34.8,146.0,Sevilla,...,Midfielder,2.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,5.0



=== Rows for: Henderson ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
370,Henderson,Liverpool,Midfielder,666.0,12.0,1.0,2.0,85.7,55.0,Liverpool,...,Midfielder,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,12.0
371,Henderson,Man. United,Goalkeeper,68.0,1.0,0.0,0.0,4.4,55.0,Liverpool,...,Midfielder,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,12.0



=== Rows for: Herrera ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
375,Herrera,Paris,Midfielder,326.0,6.0,1.0,0.0,44.3,146.0,Paris,...,Midfielder,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,6.0
391,Herrera,Atlético,Midfielder,226.0,4.0,0.0,0.0,30.1,146.0,Paris,...,Midfielder,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,6.0



=== Rows for: João Mário ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
433,João Mário,Benfica,Midfielder,493.0,8.0,0.0,4.0,66.0,26.0,Benfica,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
441,João Mário,Porto,Forward,351.0,4.0,0.0,0.0,39.3,26.0,Benfica,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



=== Rows for: Karavaev ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
456,Karavaev,Zenit,Defender,299.0,4.0,0.0,1.0,38.3,417.0,Zenit,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
472,Karavaev,Dynamo Kyiv,Defender,154.0,6.0,0.0,0.0,21.6,417.0,Zenit,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



=== Rows for: Martínez ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
586,Martínez,Ajax,Defender,720.0,8.0,0.0,1.0,83.3,107.0,Inter,...,Forward,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0
610,Martínez,Inter,Forward,587.0,8.0,1.0,0.0,67.4,107.0,Inter,...,Forward,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0
634,Martínez,Leipzig,Goalkeeper,90.0,1.0,0.0,0.0,6.0,107.0,Inter,...,Forward,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0



=== Rows for: Mendy ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
678,Mendy,Real Madrid,Defender,867.0,10.0,0.0,2.0,96.3,107.0,Real Madrid,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
680,Mendy,Chelsea,Goalkeeper,840.0,9.0,0.0,0.0,57.4,107.0,Real Madrid,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



=== Rows for: Onana ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
740,Onana,LOSC,Midfielder,257.0,8.0,0.0,0.0,34.9,491.0,LOSC,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
742,Onana,Ajax,Goalkeeper,180.0,2.0,0.0,0.0,8.4,491.0,LOSC,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



=== Rows for: Peña ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
776,Peña,Malmö,Midfielder,281.0,5.0,0.0,0.0,32.5,326.0,Malmö,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
780,Peña,Villarreal,Defender,18.0,2.0,0.0,0.0,3.3,326.0,Malmö,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



=== Rows for: Sarr ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
848,Sarr,Bayern,Defender,125.0,5.0,0.0,0.0,17.3,326.0,Bayern,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
852,Sarr,Chelsea,Defender,100.0,2.0,0.0,0.0,11.9,326.0,Bayern,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



=== Rows for: Steffen ===


Unnamed: 0,player_name,ks_club,ks_position,ks_minutes_played,ks_match_played,ks_goals,ks_assists,ks_distance_covered,disc_serial,disc_club,...,goal_position,goal_goals,goal_right_foot,goal_left_foot,goal_headers,goal_others,goal_inside_area,goal_outside_areas,goal_penalties,goal_match_played
885,Steffen,Wolfsburg,Midfielder,258.0,5.0,2.0,0.0,31.1,201.0,Wolfsburg,...,Midfielder,2.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,5.0
887,Steffen,Man. City,Goalkeeper,90.0,1.0,0.0,0.0,4.0,201.0,Wolfsburg,...,Midfielder,2.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,5.0


Find stastically significant features in data set

Run model to see what gives test data

Run k-fold to test data since less than 1000 observations

**K-means Clustering**

Normalize data

Run k-means, we can discuss what best one is or switch to hierarchial

In [58]:
#I was thinking 4 for 4 positions or we can research how many football archetypes there are

**Random Forest**

Run the random forest

Compare to logistic regression