In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Location of the Excel workbook that holds the player statistics.
file_path = 'PATH_TO/Player_Clustering_KI-Modell/top5-players24-25.xlsx'

# Read the workbook into a DataFrame (pandas reads the first sheet by default).
df = pd.read_excel(io=file_path)

# Preview the first five rows to get a feel for the column layout;
# the table is rendered as the cell output when the notebook is run.
df.head(n=5)

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90
0,1,Max Aarons,eng ENG,DF,Bournemouth,eng Premier League,24.0,2000.0,3,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Max Aarons,eng ENG,"DF,MF",Valencia,es La Liga,24.0,2000.0,4,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.02
2,3,Rodrigo Abajas,es ESP,DF,Valencia,es La Liga,21.0,2003.0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.1,0.1,0.1
3,4,James Abankwah,ie IRL,"DF,MF",Udinese,it Serie A,20.0,2004.0,6,0,...,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.06,0.06,0.06
4,5,Keyliane Abdallah,fr FRA,FW,Marseille,fr Ligue 1,18.0,2006.0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Datenvorbereitung

## Werte pro 90 Minuten berechnen

In [2]:
# Keep only rows with a positive "90s" value so the divisions below never divide by zero.
df = df.loc[df["90s"].gt(0)].copy()

# Each derived per-90 column mapped to the raw column it is computed from.
# NOTE(review): the "PrgR" column is labelled "Progressive Runs" here — confirm that this
# matches the meaning of PrgR in the data source.
per90_sources = {
    "Tore_ohne11_per90": "G-PK",
    "Assists_per90": "Ast",
    "xG_per90": "xG",
    "xAG_per90": "xAG",
    "Progressive Carries_per90": "PrgC",
    "Progressive Passes_per90": "PrgP",
    "Progressive Runs_per90": "PrgR",
}

# Divide every raw column by the "90s" column and append the results in mapping order.
df = df.assign(
    **{target: df[source].div(df["90s"]) for target, source in per90_sources.items()}
)

## Nur relevante Datensätze und Attribute berücksichtigen

In [3]:
# Restrict the data to players with enough playing time (at least 450 minutes).
df_filtered = df.loc[df["Min"].ge(450)].copy()

# The numeric per-90 attributes that take part in the comparison.
features = [
    "Tore_ohne11_per90", "Assists_per90",
    "xG_per90", "xAG_per90",
    "Progressive Carries_per90", "Progressive Passes_per90", "Progressive Runs_per90",
]

# Per-feature mean and sample standard deviation (ddof=1 is the pandas default);
# both are used for standardization further down.
means = df_filtered.loc[:, features].mean()
stds = df_filtered.loc[:, features].std(ddof=1)

# Ähnliche Spieler finden

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances

# Find the players most similar to a reference player.
#
# Every feature is standardized, weighted by how strongly the reference player deviates
# from the mean on it, and compared via Euclidean distance. The distance is then mapped
# to a similarity score in (0, 1], where 1 means identical feature values.

# Name of the reference player (must match the "Player" column exactly).
spieler_name = "Leroy Sané"

# Tunable constants of the similarity search.
WEIGHT_OFFSET = 0.1  # added to every strength so that no feature ends up with zero weight
TOP_N = 10           # number of most similar players to report

# Locate the reference player once and reuse the mask everywhere below.
player_mask = df_filtered["Player"] == spieler_name
if not player_mask.any():
    # Fail with a clear message instead of an opaque positional-indexer IndexError.
    raise ValueError(
        f"Player {spieler_name!r} not found in df_filtered "
        "(check the spelling and the minimum-minutes filter)."
    )

# Positional and label index of the first matching row. If the name matches several
# rows, the first one is the reference; all of them are excluded from the candidates
# and all of them are listed at the top of the result table.
# Deriving the position from the mask also works when index labels are not unique.
index_loc = int(np.flatnonzero(player_mask.to_numpy())[0])
index = df_filtered.index[index_loc]

# Feature values of the reference player.
player_vals = df_filtered[features].iloc[index_loc]

# Strength = absolute deviation of the player from the mean, in standard deviations.
strengths = ((player_vals - means) / stds).abs()

# Weights are proportional to the strengths (plus the offset) and normalized to sum to 1.
weights = strengths + WEIGHT_OFFSET
weights = weights / weights.sum()

# Standardize the features of all players.
X = df_filtered[features]
X_scaled = (X - means) / stds

# Scale each column by the square root of its weight, so that the squared Euclidean
# distance becomes the weighted sum of squared feature differences.
weights_sqrt = np.sqrt(weights.values)
X_weighted = X_scaled * weights_sqrt

# Weighted Euclidean distance from the reference player to every player.
dists = euclidean_distances([X_weighted.iloc[index_loc]], X_weighted)[0]

# Convert distance to similarity: the smaller the distance, the larger the similarity.
similarities = 1 / (1 + dists)

# Stored on df_filtered so that the score is available alongside the player data.
df_filtered["similarity"] = similarities

# All other players, most similar first.
similar_players = df_filtered[~player_mask].sort_values(by="similarity", ascending=False)

# Reference player followed by the TOP_N most similar players.
result_columns = ["Player", "Pos"] + features + ["similarity"]
vergleich = pd.concat([
    df_filtered.loc[player_mask, result_columns],
    similar_players[result_columns].head(TOP_N),
]).reset_index(drop=True)

# Show the result.
display(vergleich.round(2))

Unnamed: 0,Player,Pos,Tore_ohne11_per90,Assists_per90,xG_per90,xAG_per90,Progressive Carries_per90,Progressive Passes_per90,Progressive Runs_per90,similarity
0,Leroy Sané,FW,0.6,0.27,0.56,0.3,4.12,3.52,14.62,1.0
1,Ademola Lookman,"MF,FW",0.56,0.2,0.41,0.28,5.68,2.92,12.64,0.58
2,Serge Gnabry,"FW,MF",0.51,0.36,0.54,0.37,3.19,3.48,10.8,0.55
3,Bradley Barcola,FW,0.58,0.41,0.55,0.42,5.66,3.97,15.45,0.55
4,Raphinha,"FW,MF",0.51,0.29,0.61,0.4,2.98,4.29,11.08,0.54
5,Mohamed Salah,FW,0.53,0.48,0.67,0.38,4.11,3.84,13.01,0.53
6,Luis Díaz,FW,0.49,0.19,0.45,0.19,4.04,4.12,10.97,0.52
7,Cody Gakpo,FW,0.47,0.19,0.33,0.2,2.74,2.42,12.93,0.49
8,Kingsley Coman,FW,0.35,0.28,0.3,0.23,4.65,3.17,15.14,0.48
9,Kylian Mbappé,FW,0.74,0.09,0.8,0.24,4.64,4.33,11.95,0.47
