In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
defending_csv = "cleaned_data/defending_cleaned.csv"
passing_csv = "cleaned_data/passing_cleaned.csv"
standard_csv = "cleaned_data/standard_cleaned.csv"

df_def = pd.read_csv(defending_csv)
df_pass = pd.read_csv(passing_csv)
df_std = pd.read_csv(standard_csv)

In [5]:
print("Defending columns:", df_def.columns.tolist())
print("Passing columns:", df_pass.columns.tolist())
print("Standard columns:", df_std.columns.tolist())

Defending columns: ['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'Born', '90s', 'Tkl', 'TklW', 'Def 3rd', 'Mid 3rd', 'Att 3rd', 'DribTkl', 'Att', 'Tkl%', 'Lost', 'Blocks', 'Sh', 'Pass', 'Int', 'Tkl+Int', 'Clr', 'Err', 'Matches', 'Season', 'Interceptions/90', 'Tackles/90']
Passing columns: ['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'Born', '90s', 'Cmp_Total', 'Att_Total', 'Cmp%_Total', 'TotDist', 'PrgDist', 'Cmp_Short', 'Att_Short', 'Cmp%_Short', 'Cmp_Medium', 'Att_Medium', 'Cmp%_Medium', 'Cmp_Long', 'Att_Long', 'Cmp%_Long', 'Ast', 'xAG', 'xA', 'A-xAG', 'KP', '1/3', 'PPA', 'CrsPA', 'PrgP', 'Matches', 'Season', 'Cmp_Total/90', 'Cmp_Short/90', 'Cmp_Medium/90', 'Cmp_Long/90', 'Att_Total/90', 'Att_Short/90', 'Att_Medium/90', 'Att_Long/90']
Standard columns: ['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'Born', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'xG', 'npxG', 'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR

In [6]:
for df in [df_def, df_pass, df_std]:
    if 'Competition' in df.columns:
        df.rename(columns={'Competition': 'Comp'}, inplace=True)

In [8]:
for df in [df_def, df_pass, df_std]:
    if 'Competition' in df.columns:
        df.rename(columns={'Competition': 'Comp'}, inplace=True)
    if 'Age' in df.columns:
        df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

In [9]:
merge_keys = ['Player', 'Season', 'Nation', 'Pos', 'Squad', 'Comp', 'Age']

# Merge defending + passing
df_merged = pd.merge(df_def, df_pass, on=merge_keys, suffixes=('_def', '_pass'), how='inner')
# Merge with standard
df_merged = pd.merge(df_merged, df_std, on=merge_keys, suffixes=('', '_std'), how='inner')

print("Merged shape:", df_merged.shape)

Merged shape: (19564, 95)


In [10]:
drop_cols = ['Nation', 'Comp', 'Born']  # Add more as needed
df_final = df_merged.drop(columns=[col for col in drop_cols if col in df_merged.columns])

In [11]:
# Identify non-numeric columns to exclude
non_numeric = ['Player', 'Season', 'Pos', 'Squad', 'Age']
feature_cols = [col for col in df_final.columns if col not in non_numeric and pd.api.types.is_numeric_dtype(df_final[col])]

# Final feature matrix for clustering
X = df_final[feature_cols].copy()

In [16]:
# Check for inf/-inf and replace with NaN
X.replace([np.inf, -np.inf], np.nan, inplace=True)

# Option 1: Drop rows with NaN (if you want only complete cases)
X = X.dropna()

# Option 2: Or fill NaN with 0 or column mean (choose what makes sense for your use case)
# X = X.fillna(0)
# X = X.fillna(X.mean())

# Now scale
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

print(X.isnull().sum())
print(np.isinf(X).sum())

Born_def          0
90s_def           0
Tkl               0
TklW              0
Def 3rd           0
                 ..
xG_per90          0
xAG_per90         0
xG+xAG_per90      0
npxG_per90        0
npxG+xAG_per90    0
Length: 68, dtype: int64
Born_def          0
90s_def           0
Tkl               0
TklW              0
Def 3rd           0
                 ..
xG_per90          0
xAG_per90         0
xG+xAG_per90      0
npxG_per90        0
npxG+xAG_per90    0
Length: 68, dtype: int64


In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
def prepare_player_clustering_df(
    defending_csv="cleaned_data/defending_cleaned.csv",
    passing_csv="cleaned_data/passing_cleaned.csv",
    standard_csv="cleaned_data/standard_cleaned.csv",
    drop_cols=None,
    scale_method="zscore"
):
    """
    Loads, merges, and prepares player-level stats for clustering.

    Returns:
        df_final: DataFrame with identifiers and features
        X_scaled: Standardized feature matrix (for clustering)
    """
    # Load
    df_def = pd.read_csv(defending_csv)
    df_pass = pd.read_csv(passing_csv)
    df_std = pd.read_csv(standard_csv)

    # Harmonize column names if needed
    for df in [df_def, df_pass, df_std]:
        if 'Competition' in df.columns:
            df.rename(columns={'Competition': 'Comp'}, inplace=True)

    merge_keys = ['Player', 'Season', 'Nation', 'Pos', 'Squad', 'Comp', 'Age']

    # Merge
    df_merged = pd.merge(df_def, df_pass, on=merge_keys, suffixes=('_def', '_pass'), how='inner')
    df_merged = pd.merge(df_merged, df_std, on=merge_keys, suffixes=('', '_std'), how='inner')

    # Drop unwanted columns
    if drop_cols is None:
        drop_cols = ['Nation', 'Comp', 'Born']
    df_final = df_merged.drop(columns=[col for col in drop_cols if col in df_merged.columns])

    # Identify features
    non_numeric = ['Player', 'Season', 'Pos', 'Squad', 'Age']
    feature_cols = [col for col in df_final.columns if col not in non_numeric and pd.api.types.is_numeric_dtype(df_final[col])]

    # Standardize
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    scaler = StandardScaler() if scale_method == "zscore" else MinMaxScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(df_final[feature_cols]), columns=feature_cols, index=df_final.index)

    return df_final, X_scaled

In [19]:
df_final, X_scaled = prepare_player_clustering_df()

print(df_final[['Player', 'Season', 'Squad', 'Pos']].head())
print(X_scaled.head())

FileNotFoundError: [Errno 2] No such file or directory: 'passing_cleaned.csv'