In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.impute import KNNImputer

In [None]:
#goal: find the best replacement for Mohamed Salah, star player for Liverpool FC, whose contract expires at the end of the current season (2025). We attempt to use data-driven methods to find the best possible replacement.
#we want to reduce the data to manageable dimensions, cluster the data, and find the nearest neighbors closest to the Mohamed Salah datapoint. Then, within the cluster,
#we can use KNN regression to predict which player is going to have the highest xG, the target variable we want to focus on once the data is clustered. We want to first focus on players with similar playing styles, hence the first step of clustering.
#then, out of the players with similar playing styles, we want to choose the best possible replacement from a quality standpoint. xG is traditionally the variable that correlates most closely with goals scored and team performance,
#so we want to use xG as our barometer metric of quality (how good a player is).
#goal is to find the nearest neighbor with the highest xG (a metric that is predictive of a player's ability to score goals)

In [None]:
# Read the dataset from a CSV file (path is relative to the working directory)
# and preview the first rows.
DATA_FILE = 'event_data_final.csv'
df = pd.read_csv(DATA_FILE)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'event_data_final.csv'

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# **DATA CLEANING**

We used a dataset from FBref. We first search for any players/rows with missing data. To fill in that missing data, we use K-Nearest Neighbors imputation. We also converted all the columns into float values.

In [None]:
#handling NAs / any missing values --> use KNN imputation to fill in missing values

In [None]:
# Flag every row that has at least one missing value, then keep only those rows.
has_missing = df.isna().any(axis='columns')
rows_with_missing_values = df.loc[has_missing]

rows_with_missing_values

NameError: name 'df' is not defined

In [None]:
# Some 'Min' entries are strings containing commas (presumably thousands
# separators); remove the commas from string entries, leave every other value
# untouched, then cast the whole column to float.
df['Min'] = (
    df['Min']
    .map(lambda entry: entry.replace(',', "") if isinstance(entry, str) else entry)
    .astype(float)
)

df['Min']

In [None]:
# Labels of every column whose dtype is numeric.
numeric_only = df.select_dtypes(include='number')
numerical_cols = numeric_only.columns

In [None]:
# Fill missing numeric values with a KNN imputer configured for 5 neighbors.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(df.loc[:, numerical_cols])
df[numerical_cols] = imputed_values

# Show any rows that still contain a missing value after imputation.
still_missing = df.isna().any(axis='columns')
rows_with_missing_values_after_imputation = df.loc[still_missing]
rows_with_missing_values_after_imputation

In [None]:
# Inspect the dtype pandas holds for each column of df.

# Last expression in the cell, so the resulting Series is rendered as the cell output.
df.dtypes

# **EDA**

In [None]:
# Histogram of goals scored ('Gls'), with a KDE curve overlaid so the shape of
# the distribution is easier to read.
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(data=df, x='Gls', kde=True, color='maroon', ax=ax)
ax.set_title('Goals Scored')
ax.set_xlabel('Goals')
ax.set_ylabel('Count')
plt.show()

# #distribution of expected goals (xG)
# plt.figure(figsize=(10,10))
# sns.histplot(df['xG'], kde= True)
# plt.title('Expected goals')
# plt.xlabel('expected goals (xG)')
# plt.ylabel('Count')
# plt.show()

# #pass completion vs goals scored
# plt.figure(figsize=(10,10))
# sns.scatterplot(x=df['Touhces Att Pen'],y=df['xG'])
# plt.title('Touches in Penalty Box vs Goals Scored')
# plt.xlabel('pass completion (%)')
# plt.ylabel('goals scored')
# plt.show()


Explanation: The majority of players in our dataset scored fewer than 10 goals in their careers. Our distribution is skewed right: only a few players scored more than 30 goals. Our replacement is most likely among those who scored more than 30.

In [None]:
# Histogram of expected goals ('xG') with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(data=df, x='xG', kde=True, color='maroon', ax=ax)
ax.set_title('Expected goals')
ax.set_xlabel('expected goals (xG)')
ax.set_ylabel('Count')
plt.show()

Explanation: The histogram shows the distribution of players' expected goals (xG) across the dataset. The majority of players have an xG value between 0 and 10, suggesting that most players in the dataset are expected to score about 5 goals.


In [None]:
#penalty-box touches vs expected goals (xG)
# The y-axis plots df['xG'], so the title and y-label name expected goals
# rather than goals scored.
plt.figure(figsize=(10,10))
sns.scatterplot(x=df['Touhces Att Pen'],y=df['xG'], color='maroon')
plt.title('Touches in Penalty Box vs Expected Goals (xG)')
plt.xlabel('touches in the penalty box')
plt.ylabel('expected goals (xG)')
plt.show()

Explanation: The scatter plot shows the relationship between touches in the penalty box and expected goals (xG). There is a positive correlation, meaning that players who have more touches in the penalty box typically have a higher xG.

In [None]:
# Order the rows by goals scored, highest first, and show the first ten.
by_goals_desc = df.sort_values(by='Gls', ascending=False)
by_goals_desc.head(10)

NameError: name 'df' is not defined

In [None]:
# Pairwise correlations between a handful of selected metrics, drawn as an
# annotated heatmap.
selected_columns = [
    'Gls',
    'xG',
    'xAG',
    'Shots',
    'Pass Cmp %',
    'Touhces Att Pen',
]

correlation_matrix = df.loc[:, selected_columns].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', cbar=True, ax=ax)
ax.set_title('correlation matrix of key metrics')
plt.show()

Explanation: This correlation matrix shows the correlation coefficients between our key metrics. The highest coefficient in the matrix is between xG and Gls (expected goals and goals), which suggests a strong relationship between a player's goals and that player's expected goals. The lowest coefficients belong to Pass Cmp %, the percentage of passes completed. This suggests that pass completion percentage has little correlation with a player's expected goals; it has only a weak relationship with all of our other key metrics.

# **ANALYZING THE DATA**

In [None]:
''' Unsupervised learning - PCA dimensionality reduction + K-Means/K-NN Clustering to group players based on their different play styles and roles;
different players, even though they play the same position on the same side, often have completely different roles for their respective teams,
making finding a stylistic fit extremely important to best replace a player. The player stats and their different combinations indicate different playstyles and roles'''

'''Supervised Learning - the player we want to scout a replacement for is Mohamed Salah. We want to select all the players that are in the same cluster as he is (players with
similar playing styles), and select those observations as a new dataframe. We will use player name to index and get the original columns from the data. Then we can use
KNN regression to target the player who is predicted to have the highest xG, a metric indicative of player quality'''

'''unsupervised - cluster players on play style
supervised - from the players in the same cluster as Mohammed Salah, predict player with highest xG'''

In [None]:
# List the column labels of df; the feature lists in the following cells are
# written out by hand using these exact labels.
df.columns

Here, we applied PCA dimension reduction to make it easier for k means clustering to cluster our players. We reduced all our features that were float values into 3 principal components.

After applying PCA, we now do K-Means clustering. We selected k to be 5 due to there being 5 different play styles/roles that forward players have.

1) 'Breaker' - best suited at 1v1s

2) 'Runner' - TODO: describe what this role does

3) 'Creator' - the one who starts the attack

4) 'Magician' - exceptional creativity, high football IQ with many tools and techniques to utilize

5) 'Carrier' - TODO: describe what this role does




In [None]:
# Dimensionality reduction followed by clustering:
#   1. standardise the selected stat columns,
#   2. project them onto 3 principal components,
#   3. group the players into 5 K-Means clusters,
#   4. plot the first two components coloured by cluster label.

# Columns fed into the PCA.
pca_cols = [
    'Avg Sht Dist', 'FK',
    'Pass Cmp %', 'Assisted Sht',
    'Completed Final Third', 'Passes into Pen Box', 'Crosses Cmpl',
    'Progressive Passes', 'Total Passing Distance',
    'Total Progressive Pass Distance', 'Short Passes Completed',
    'Short Passes Att', 'Short Passing Cmpl %', 'Medium Passes Cmpl',
    'Medium Passes Att', 'Medum Pass Cmpl %', 'Long Passes Cmpl',
    'Long Passes Att', 'Long Pass Cmpl %', 'Shot Creating Actions',
    'Live Pass to Shot', 'Dead Pass to Shot', 'Take-Ons to Shot',
    'Shots  to  Shot', 'Def Actions to Shot',
    'Goal-Creating Actions', 'Goal Creating Live Passes',
    'Goal Creating Dead Passes', 'Take-Ons to Goal', 'Shots to Goal Scored',
    'Fouls to Goals', 'Def Actions to Goal', 'Carries',
    'Total Carrying Distance', 'Total Progressive Distance',
    'Progressive Carries', 'Final Third Entry Carries', 'Carries into box',
    'Dispossessions', 'Passes Received',
    'Prgogressive Passes Received', 'Total Touches', 'Touches  Def Pen',
    'Touches  Def 3rd', 'Touches Mid 3', 'Touches Att 3rd',
    'Touhces Att Pen', 'Touches Live', 'Take-Ons Attempted',
    'Take-Ons Sucessful ', 'Take-On Success', 'Tackled', 'Tackled %',
]

data = df.loc[:, pca_cols]

# Standardise each column before the PCA.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Keep the first three principal components.
pca = PCA(n_components=3)
data_pca = pca.fit_transform(scaled_data)

# K-Means on the reduced data; the label of each row is stored on df.
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(data_pca)
df['cluster'] = cluster_labels

clusters = df['cluster']

# Scatter of PC1 vs PC2, coloured by cluster label.
fig, ax = plt.subplots(figsize=(10, 7))
first_component = data_pca[:, 0]
second_component = data_pca[:, 1]
points = ax.scatter(first_component, second_component, c=clusters, cmap='Reds', s=50, alpha=0.7)
ax.set_title('Player Clusters')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()

After clustering our players, we take the cluster containing Mohamed Salah and make a new dataframe with only the players that belong to that cluster. That cluster is labeled '2'. We now apply a KNN regression model to predict the xG value of each player. The player that matches or exceeds Mohamed Salah's xG value will be selected as his replacement.

In [None]:
# Find the cluster label assigned to Mohamed Salah.
salah_cluster_labels = df.loc[df['Player'] == 'Mohamed Salah', 'cluster']
if salah_cluster_labels.empty:
    # Fail with a clear message instead of an opaque IndexError on an empty selection.
    raise ValueError("Player 'Mohamed Salah' was not found in df['Player']")
mohamed_salah_cluster = salah_cluster_labels.iloc[0]

# Every player in the same cluster. .copy() makes this an independent frame, so
# columns added to it later are not written into a filtered slice of df
# (which triggers pandas' SettingWithCopyWarning).
similar_players = df[df['cluster'] == mohamed_salah_cluster].copy()

similar_players['Player']

In [None]:
#create a model for xG from the original dataset using only the same columns used for the PCA; we don't have a model for xG, or a way to predict it from our data,
#we only have what's provided. By predicting xG from the data, we can rank the players in the cluster by predicted quality.


#slice a new dataframe with only clustered players along with their data

#apply model to the dataframe with only players in the same cluster as mohammed salah


#identify the player with the highest predicted xG

In [None]:
# Display the frame of players that share Mohamed Salah's cluster.
similar_players

In [None]:
# Fit a 5-neighbor KNN regressor that predicts xG from the standardised stat
# columns of the players in similar_players.

# Predictor columns.
features = [
    'Starts', 'SoT', '% SoT', 'Avg Sht Dist', 'FK',
    'Total Passes Comp', 'Total Passes Att', 'Pass Cmp %', 'Assisted Sht',
    'Completed Final Third', 'Passes into Pen Box', 'Crosses Cmpl',
    'Progressive Passes', 'Total Passing Distance',
    'Total Progressive Pass Distance', 'Short Passes Completed',
    'Short Passes Att', 'Short Passing Cmpl %', 'Medium Passes Cmpl',
    'Medium Passes Att', 'Medum Pass Cmpl %', 'Long Passes Cmpl',
    'Long Passes Att', 'Long Pass Cmpl %', 'Shot Creating Actions',
    'Live Pass to Shot', 'Dead Pass to Shot', 'Take-Ons to Shot',
    'Shots  to  Shot', 'Fouls Drawn', 'Def Actions to Shot',
    'Goal-Creating Actions', 'Goal Creating Live Passes',
    'Goal Creating Dead Passes', 'Take-Ons to Goal', 'Shots to Goal Scored',
    'Fouls to Goals', 'Def Actions to Goal', 'Carries',
    'Total Carrying Distance', 'Total Progressive Distance',
    'Progressive Carries', 'Final Third Entry Carries', 'Carries into box',
    'Miscontrols', 'Dispossessions', 'Passes Received',
    'Prgogressive Passes Received', 'Total Touches', 'Touches  Def Pen',
    'Touches  Def 3rd', 'Touches Mid 3', 'Touches Att 3rd',
    'Touhces Att Pen', 'Touches Live', 'Take-Ons Attempted',
    'Take-Ons Sucessful ', 'Take-On Success', 'Tackled', 'Tackled %',
]

# Column the regressor is trained to predict.
target = 'xG'

X = similar_players.loc[:, features]
y = similar_players.loc[:, target]

# Standardise the predictors before fitting.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_scaled, y)

In [None]:
# Attach the KNN prediction for every player in the cluster.
# assign() returns a new DataFrame, so the column is never written into a frame
# that may still be a filtered slice of df (which triggers pandas'
# SettingWithCopyWarning). Re-running the cell simply overwrites the column.
# NOTE(review): X_scaled appears to be the same matrix knn was fitted on, so
# these are in-sample predictions -- confirm that is intended.
similar_players = similar_players.assign(predicted_xG=knn.predict(X_scaled))

# Row with the highest predicted xG.
top_player = similar_players.loc[similar_players['predicted_xG'].idxmax()]
top_player_name = top_player['Player']
top_player_predicted_xG = top_player['predicted_xG']

# Names of the ten players with the highest predicted xG, best first.
top_players_list = similar_players.nlargest(10, 'predicted_xG')['Player'].tolist()


print(f"Top player: {top_player_name}, Predicted xG: {top_player_predicted_xG}")
print("List of top players:", top_players_list)


In [None]:
# List of Top Forward players:
top_forwards = [
    "Harry Kane",
    "Erling Haaland",
    "Kylian Mbappé",
    "Robert Lewandowski",
    "Lautaro Martínez",
    "Loïs Openda",
    "Jonathan David",
    "Alexandre Lacazette",
    "Mohamed Salah",
    "Serhou Guirassy"
]

print("Top 10 Forwards in European Leagues:")
for i, player in enumerate(top_forwards, start=1):
    print(f"{i}. {player}")

while True:
    try:
        player_index = int(input("Enter the number corresponding to the player (1-10): "))
        if 1 <= player_index <= 10:
            selected_player = top_forwards[player_index - 1]
            break
        else:
            print("Please enter a number between 1 to 10.")
    except ValueError:
        print("Invalid input. Please enter a number.")

print(f"\nYou selected: {selected_player}")
selected_player_cluster = similar_players[similar_players['Player'] == selected_player]['cluster'].values[0]
similar_players_cluster = similar_players[similar_players['cluster'] == selected_player_cluster].copy()

features = [
    'Starts', 'SoT', '% SoT', 'Avg Sht Dist', 'FK',
    'Total Passes Comp', 'Total Passes Att', 'Pass Cmp %', 'Assisted Sht',
    'Completed Final Third', 'Passes into Pen Box', 'Crosses Cmpl',
    'Progressive Passes', 'Total Passing Distance',
    'Total Progressive Pass Distance', 'Short Passes Completed',
    'Short Passes Att', 'Short Passing Cmpl %', 'Medium Passes Cmpl',
    'Medium Passes Att', 'Medum Pass Cmpl %', 'Long Passes Cmpl',
    'Long Passes Att', 'Long Pass Cmpl %', 'Shot Creating Actions',
    'Live Pass to Shot', 'Dead Pass to Shot', 'Take-Ons to Shot',
    'Shots  to  Shot', 'Fouls Drawn', 'Def Actions to Shot',
    'Goal-Creating Actions', 'Goal Creating Live Passes',
    'Goal Creating Dead Passes', 'Take-Ons to Goal', 'Shots to Goal Scored',
    'Fouls to Goals', 'Def Actions to Goal', 'Carries',
    'Total Carrying Distance', 'Total Progressive Distance',
    'Progressive Carries', 'Final Third Entry Carries', 'Carries into box',
    'Miscontrols', 'Dispossessions', 'Passes Received',
    'Prgogressive Passes Received', 'Total Touches', 'Touches  Def Pen',
    'Touches  Def 3rd', 'Touches Mid 3', 'Touches Att 3rd',
    'Touhces Att Pen', 'Touches Live', 'Take-Ons Attempted',
    'Take-Ons Sucessful ', 'Take-On Success', 'Tackled', 'Tackled %'
]

target = 'xG'

X = similar_players_cluster[features].values
y = similar_players_cluster[target].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_scaled, y)

selected_player_stats = similar_players_cluster[similar_players_cluster['Player'] == selected_player][features].values
selected_player_scaled = scaler.transform(selected_player_stats)
selected_player_pred = knn.predict(selected_player_scaled)[0]

similar_players_cluster.loc[:, 'Predicted xG'] = knn.predict(X_scaled)
similar_players_without_selected = similar_players_cluster[similar_players_cluster['Player'] != selected_player].copy()
similar_players_without_selected.loc[:, 'xG Difference'] = abs(similar_players_without_selected['Predicted xG'] - selected_player_pred)
sorted_similar_players = similar_players_without_selected.sort_values(by=['xG Difference', 'Gls'], ascending=[True, False])

best_fit_player = sorted_similar_players.iloc[0]
print(f"\nBest Replacement for {selected_player}: {best_fit_player['Player']}")
print(best_fit_player[['Player', 'Matches Played', 'Gls', 'Predicted xG']])

print("\nSorted List of Closest Players by xG (ties resolved by Goals):")
print(sorted_similar_players[['Player', 'Matches Played', 'Gls', 'Predicted xG', 'xG Difference']].head())


Top 10 Forwards in European Leagues:
1. Harry Kane
2. Erling Haaland
3. Kylian Mbappé
4. Robert Lewandowski
5. Lautaro Martínez
6. Loïs Openda
7. Jonathan David
8. Alexandre Lacazette
9. Mohamed Salah
10. Serhou Guirassy
Enter the number corresponding to the player (1-10): 9

You selected: Mohamed Salah


NameError: name 'similar_players' is not defined