In [1]:
from mplsoccer import Sbopen
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score
import streamlit as st
from joblib import load
from db_connection import get_db

In [2]:
# Open the StatsBomb client and obtain a database session.
statsbomb_api = Sbopen()
db = next(get_db())

# Keep only competitions whose `match_available_360` field is populated.
competition_list = statsbomb_api.competition()
has_360_flag = competition_list['match_available_360'].notna()
competitions = competition_list[has_360_flag]

# Gather the match ids of every remaining competition/season pair.
matches = []
for comp in competitions.itertuples(index=False):
    df_match = statsbomb_api.match(competition_id=comp.competition_id, season_id=comp.season_id)
    matches.extend(df_match['match_id'].unique())

In [3]:
len(set(matches))

425

In [None]:
import time

# Download the events and the 360 frames of every distinct match id.
#
# Changes compared with the previous version of this cell:
# * `matplotlib.widgets` does not provide `IntProgress`, so the progress bar
#   could not be created; progress is now reported with a plain text counter.
# * Match ids are de-duplicated so no match is loaded (and counted) twice.
# * Per-match tables are collected in lists and concatenated once instead of
#   re-copying an ever-growing DataFrame on each iteration.
# * Only `Exception` is caught, so interrupting the kernel stops the loop
#   instead of being swallowed and printed as 'except'.
unique_matches = list(dict.fromkeys(matches))  # drop duplicates, keep first-seen order
event_frames = []
track_frames = []
failed_matches = []  # ids whose download raised, kept for inspection

for n_done, match in enumerate(unique_matches, start=1):
    try:
        # Both calls return a sequence; only its first item is used here.
        df_event = statsbomb_api.event(match)[0]
        df_track = statsbomb_api.frame(match)[0]
    except Exception as exc:
        failed_matches.append(match)
        print(f'except: match {match} skipped ({exc!r})')
        time.sleep(1)  # short pause before the next request
        continue
    # A match is kept only when both downloads succeeded.
    event_frames.append(df_event)
    track_frames.append(df_track)
    print(f'Loading: {n_done}/{len(unique_matches)}', end='\r')

event_df = pd.concat(event_frames, ignore_index=True) if event_frames else pd.DataFrame()
track_df = pd.concat(track_frames, ignore_index=True) if track_frames else pd.DataFrame()

except


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000002703EECA450>>
Traceback (most recent call last):
  File "C:\Users\juras\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 790, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
                                                 ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\juras\miniconda3\envs\football-scouting\Lib\threading.py", line 1535, in enumerate
    def enumerate():
    
KeyboardInterrupt: 


In [None]:
# Keep only events whose start coordinates are both present and not +/- infinity.
start_xy = event_df[['x', 'y']]
has_both_coords = start_xy.notna().all(axis=1)
has_infinite_coord = start_xy.isin([float('inf'), float('-inf')]).any(axis=1)
event_df = event_df[has_both_coords & ~has_infinite_coord]

In [None]:
event_df[event_df[['end_x', 'end_y']].isna().any(axis=1)].type_name.value_counts()


In [None]:
# Events without an end location are treated as ending where they started.
event_df['end_x'] = event_df['end_x'].fillna(event_df['x'])
event_df['end_y'] = event_df['end_y'].fillna(event_df['y'])

# Define Pitch Zones


In [None]:
print(event_df['x'].min(),event_df['x'].max())
print(event_df['y'].min(),event_df['y'].max())

In [None]:
def get_pitch_zone(x, y):
    """Return a two-word zone label for a pitch location.

    The first word depends on ``x``: ``'Defensive'`` when ``x < 40``,
    ``'Middle'`` when ``40 <= x <= 80`` and ``'Attacking'`` otherwise.
    The second word depends on ``y``: ``'Right'`` when ``y < 20``,
    ``'Center'`` when ``20 <= y <= 60`` and ``'Left'`` otherwise.

    A value for which both comparisons are false (for example NaN) falls
    through to the last label of its axis.

    NOTE(review): whether small ``y`` really is the right-hand side depends on
    the coordinate convention of the data provider -- confirm.

    Returns:
        str: ``"<x label> <y label>"``, e.g. ``"Middle Center"``.
    """
    length_band = 'Defensive' if x < 40 else ('Middle' if x <= 80 else 'Attacking')
    width_band = 'Right' if y < 20 else ('Center' if y <= 60 else 'Left')
    return f"{length_band} {width_band}"

# Label every event with the zone it starts in and the zone it ends in.
event_df['start_zone'] = [
    get_pitch_zone(x, y) for x, y in zip(event_df['x'], event_df['y'])
]
event_df['end_zone'] = [
    get_pitch_zone(x, y) for x, y in zip(event_df['end_x'], event_df['end_y'])
]


In [None]:
# `df_event` is a second name for the same object as `event_df` (no copy is made).
df_event = event_df

# Detailed position names, listed under the coarse group they collapse into.
position_groups = {
    "Striker": [
        "Center Forward",
        "Left Center Forward",
        "Right Center Forward",
        "Second Striker",
        "Striker",
    ],
    "Center Back": [
        "Center Back",
        "Left Center Back",
        "Right Center Back",
    ],
    # Fullbacks and wingbacks
    "Wide Back": [
        "Left Back",
        "Right Back",
        "Left Wing Back",
        "Right Wing Back",
    ],
    # Central, defensive and attacking midfielders
    "Center Midfield": [
        "Center Midfield",
        "Left Center Midfield",
        "Right Center Midfield",
        "Center Defensive Midfield",
        "Left Defensive Midfield",
        "Right Defensive Midfield",
        "Center Attacking Midfield",
        "Left Attacking Midfield",
        "Right Attacking Midfield",
    ],
    # Wide midfielders and wingers
    "Wide Midfield": [
        "Left Midfield",
        "Right Midfield",
        "Left Wing",
        "Right Wing",
    ],
    "Goalkeeper": ["Goalkeeper"],
}

# Invert the grouping into the lookup {detailed position -> group}.
position_map = {
    detailed_name: group_name
    for group_name, detailed_names in position_groups.items()
    for detailed_name in detailed_names
}

# Position names missing from the lookup become NaN.
df_event["pos_group"] = df_event["position_name"].map(position_map)

# Step 2: Compute Movement Features

In [None]:
# Per-event movement features derived from the start (x, y) and end (end_x, end_y).
delta_x = df_event['end_x'] - df_event['x']
delta_y = df_event['end_y'] - df_event['y']

df_event['distance'] = np.sqrt(delta_x ** 2 + delta_y ** 2)   # straight-line length
df_event['progression'] = delta_x                              # change along x
# Share of the distance that is progression; 0 where the event did not move.
df_event['directness'] = np.where(
    df_event['distance'] != 0,
    df_event['progression'] / df_event['distance'],
    0,
)
df_event['width_change'] = delta_y.abs()                       # absolute change along y
#df_event['speed'] = np.where(df_event['duration'] != 0, df_event['distance'] / df_event['duration'], 0)
#df_event['distance_covered'] = np.sqrt((df_event['end_x'] - df_event['x']) ** 2 + (df_event['end_y'] - df_event['y']) ** 2)
#df_event['angle_change'] = np.arctan2(df_event['end_y'] - df_event['y'], df_event['end_x'] - df_event['x'])

Packing Metrics

In [None]:
# Attach the event start (renamed ball_x / ball_y) and the event end location
# to every 360-frame row, matched on the event id.
id_pos = event_df.loc[:, ['id', 'x', 'y', 'end_x', 'end_y']]
id_pos = id_pos.rename(columns={'x': 'ball_x', 'y': 'ball_y'})
track_df_full = track_df.merge(id_pos, on='id', how='left')

# Drop rows flagged as the actor; `.copy()` so the column assignments below
# write to an independent frame rather than to a slice.
track_df_full = track_df_full[track_df_full['actor'] == False].copy()

# Distance between each tracked player and the event start location.
track_df_full['distance'] = np.sqrt(
    (track_df_full['x'] - track_df_full['ball_x']) ** 2
    + (track_df_full['y'] - track_df_full['ball_y']) ** 2
)

# A tracked player counts as "packed" when their x lies strictly between the
# event start x and the event end x (the ball went past them).
track_df_full['packing'] = (
    (track_df_full['x'] > track_df_full['ball_x'])
    & (track_df_full['x'] < track_df_full['end_x'])
)

teammates_df = track_df_full[track_df_full['teammate'] == True]
teammates_df = teammates_df.rename(columns={'distance': 'teammate_distance'})

# FIX: this used to compare `ball_x > ball_x`, which is always False, so the
# aggregated feature was 0 for every event. It now compares the teammate's own
# x with the ball x, i.e. it flags teammates positioned ahead of the ball.
# NOTE(review): the downstream names (`teammate_forward`, `avg_advanced_players`)
# suggest "ahead of the ball" is the intended meaning -- confirm the direction.
teammates_df['teammate_behind_ball'] = teammates_df['x'] > teammates_df['ball_x']

opponents_df = track_df_full[track_df_full['teammate'] == False]
opponents_df = opponents_df.rename(columns={'distance': 'opponent_distance'})

# Players closer than 10 units to the event start location.
track_df_full['nearby_opponents'] = (track_df_full['distance'] < 10) & (track_df_full['teammate'] == False)
track_df_full['nearby_teammates'] = (track_df_full['distance'] < 10) & (track_df_full['teammate'] == True)

# Collapse the frame rows to one value per event id.
teammate_distance_min = teammates_df.groupby(['id'])['teammate_distance'].min().reset_index()
opponent_distance_min = opponents_df.groupby('id')['opponent_distance'].min().reset_index()

distance_count = track_df_full.groupby('id')['nearby_opponents'].sum().reset_index()
distance_count_teammates = track_df_full.groupby('id')['nearby_teammates'].sum().reset_index()
packing = opponents_df.groupby('id')['packing'].sum().reset_index()
teammate_forward = teammates_df.groupby('id')['teammate_behind_ball'].sum().reset_index()

# Left-join every per-event feature onto the events table; events without
# frame rows get NaN in the new columns.
per_event_features = [
    distance_count,
    distance_count_teammates,
    teammate_distance_min,
    opponent_distance_min,
    packing,
    teammate_forward,
]
for feature_df in per_event_features:
    df_event = df_event.merge(feature_df, on='id', how='left')

# Player level aggregation


In [None]:
print(", ".join(df_event.columns.to_list()))
df_event['sub_type_name'].unique()
df_event['body_part_name'].unique()
df_event[df_event['type_name'] == "Dribble"]['outcome_name'].unique()
#df_event[df_event['type_name'] == "Shot"]['shot_first_time'].unique()


In [None]:
# Player-level aggregation: every frame built below has one row per player_id.

# Number of events flagged `under_pressure`, per player.
df_pressure = (
    df_event.groupby(['player_id'])['under_pressure'].sum()
    .reset_index()
    .rename(columns={'under_pressure': 'pressure_count'})
)

# Count event types. `reindex` guarantees that every expected column exists
# (filled with 0) even if an event type never occurs in the loaded data; a
# plain column selection would raise KeyError in that case.
event_types = ['Pass', 'Carry', 'Dribble', 'Shot', 'Duel', 'Pressure', 'Block', 'Ball Recovery',
               'Interception', 'Clearance', 'Foul Won', 'Foul Committed']
df_counts = (
    df_event.groupby(['player_id'])['type_name'].value_counts()
    .unstack(fill_value=0)
    .reindex(columns=event_types, fill_value=0)
    .reset_index()
)

# Share of each player's events recorded in each position group.
df_position_groups = df_event.groupby(['player_id'])['pos_group'].value_counts(normalize=True).unstack(fill_value=0).reset_index()


# Passes
pass_events = df_event[df_event['type_name'] == 'Pass']

df_pass = pass_events.groupby(['player_id']).agg(
    all_passes = ('pass_length', 'count'),
    # A missing outcome is counted as successful, alongside the listed labels.
    successful_passes = ('outcome_name', lambda x: (x.isin([np.nan, 'Complete', 'Success', 'Success In Play', 'Success To Team'])).sum()),
    avg_pass_length = ('pass_length', 'mean'),
    avg_pass_angle = ('pass_angle', 'mean'),
    longest_forward_pass = ('progression', 'max'),
    # 'count' = number of non-null entries in the flag column.
    pass_switch = ('pass_switch', 'count'),
    pass_cut_back = ('pass_cut_back', 'count')
).reset_index()
df_pass['pass_success_rate'] = df_pass['successful_passes'] / df_pass['all_passes']
df_pass = df_pass.fillna(0)

# Share of each pass height per player; column names become snake_case.
df_pass_types = pass_events.groupby(['player_id'])['pass_height_name'].value_counts(normalize=True).unstack(fill_value=0).reset_index()
df_pass_types.columns = ['player_id'] + [f'{col.lower().replace(" ", "_")}' for col in df_pass_types.columns[1:]]

# Shots
df_shot = df_event[df_event['type_name'] == 'Shot'].groupby(['player_id']).agg(
    all_shots = ('id', 'count'),
    shot_mean_length = ('x', 'mean'),          # mean start x of the player's shots
    headers = ('body_part_name', lambda x: (x.isin(['Head'])).sum()),
    extra_shots = ('technique_name', lambda x: (~x.isin(['Normal'])).sum()),
    first_time_shots = ('shot_first_time', 'count'),
    total_xg = ('shot_statsbomb_xg', 'sum')
).reset_index()   # player_id as a column, like every other frame merged below
df_shot = df_shot.fillna(0)


# Dribbles
# FIX: `forward` was computed as x - end_x, i.e. with the opposite sign of
# `progression` (end_x - x), so forward movement came out negative.
df_event['forward'] = df_event["end_x"] - df_event["x"]
df_event['vertical'] = abs(df_event["y"] - df_event["end_y"])

df_dribble = df_event[df_event['type_name'] == 'Dribble'].groupby(['player_id']).agg(
    successful_dribbles = ('outcome_name', lambda x: (x.isin(['Complete'])).sum()),
    dribble_length_forward = ('forward', 'mean'),
    dribble_length_vertical =  ('vertical', 'mean'),
    dribble_start_x = ('x', 'mean')
).reset_index()
df_dribble = df_dribble.fillna(0)


# Aggregate movement features
df_movement = df_event.groupby(['player_id']).agg(
    total_distance=('distance', 'sum'),
    avg_progression=('progression', 'mean'),
    avg_directness=('directness', 'mean'),
    total_width_change=('width_change', 'sum'),
).reset_index()


# Share of events per start zone, columns named start_<zone>.
df_pitch_zones = df_event.groupby(['player_id'])['start_zone'].value_counts(normalize=True).unstack(fill_value=0).reset_index()
df_pitch_zones.columns = ['player_id'] + [f'start_{col.lower().replace(" ", "_")}' for col in df_pitch_zones.columns[1:]]

# Share of events per end zone, columns named end_<zone>.
df_pitch_zones_end = df_event.groupby(['player_id'])['end_zone'].value_counts(normalize=True).unstack(fill_value=0).reset_index()
df_pitch_zones_end.columns = ['player_id'] + [f'end_{col.lower().replace(" ", "_")}' for col in df_pitch_zones_end.columns[1:]]


# Pressure & support: means of the per-event 360-frame features.
df_pressure_support = df_event.groupby(['player_id']).agg(
    avg_opponent_distance = ('opponent_distance', 'mean'),
    avg_teammate_distance = ('teammate_distance', 'mean'),
    avg_opponent_count = ('nearby_opponents', 'mean'),
    avg_teammate_count = ('nearby_teammates', 'mean'),
    avg_advanced_players = ('teammate_behind_ball', 'mean')
).reset_index()

# Packing
df_packing = df_event.groupby(['player_id']).agg(
    total_packing=('packing', 'sum')).reset_index()


df_counterpress = (
    df_event.groupby(['player_id'])['counterpress'].sum()
    .reset_index()
    .rename(columns={'counterpress': 'counterpress_count'})
)

df_aerial_duels = (
    df_event.groupby(['player_id'])['aerial_won'].sum()
    .reset_index()
    .rename(columns={'aerial_won': 'aerial_duels_won'})
)

df_duration = df_event.groupby(['player_id']).agg(avg_duration=('duration', 'mean')).reset_index()


# Merge all aggregated data into one player-level DataFrame. Every join is a
# left join on player_id starting from the event-type counts.
player_features = (
    df_counts
    .merge(df_pass, on=['player_id'], how='left')
    .merge(df_shot, on=['player_id'], how='left')
    .merge(df_dribble, on=['player_id'], how='left')
    .merge(df_pressure, on=['player_id'], how='left')
    .merge(df_movement, on=['player_id'], how='left')
    .merge(df_pass_types, on=['player_id'], how='left')
    .merge(df_position_groups, on=['player_id'], how='left')
    .merge(df_pitch_zones, on=['player_id'], how='left')
    .merge(df_pitch_zones_end, on=['player_id'], how='left')
    .merge(df_counterpress[['player_id', 'counterpress_count']], on=['player_id'], how='left')
    .merge(df_aerial_duels[['player_id', 'aerial_duels_won']], on=['player_id'], how='left')
    .merge(df_duration[['player_id', 'avg_duration']], on=['player_id'], how='left')
    .merge(df_pressure_support, on=['player_id'], how='left')
    .merge(df_packing, on=['player_id'], how='left')
)

# Players without passes / shots / dribbles / frames get 0 instead of NaN.
player_features = player_features.fillna(0)
# Snapshot of the features before later cells modify `player_features`
# (the variable name is kept because later cells refer to it).
possession_features_original = player_features.copy()


In [None]:
player_features

In [None]:
# Cast these count columns to float in one assignment.
columns_to_convert = ['pass_switch', 'pass_cut_back', 'aerial_duels_won']
player_features[columns_to_convert] = player_features[columns_to_convert].astype(float)

In [None]:
player_features.columns

In [None]:
from sql_schemas import PlayingTimes, Competitions
from db_connection import get_db
from sqlalchemy import text

# Read every PlayingTimes row; `finally` closes the session even when the
# query raises.
db = next(get_db())
try:
    playing_times = db.query(PlayingTimes).all()
finally:
    db.close()

# One dict per ORM object; SQLAlchemy's bookkeeping attribute is dropped.
df = pd.DataFrame([vars(r) for r in playing_times]).drop(columns="_sa_instance_state")

# Total minutes per player. Grouping on player_id alone guarantees exactly one
# row per player; grouping on (player_id, player_name) would yield several rows
# for a player whose name is stored with different spellings, and the left
# merge below would then duplicate that player's feature row.
total_minutes_df = df.groupby("player_id", as_index=False).agg(
    player_name=("player_name", "first"),
    minutes=("minutes", "sum"),
)

# Players without playing-time rows get NaN for player_name and minutes.
player_features = player_features.merge(total_minutes_df, on=['player_id'], how='left')

In [None]:
player_features.to_excel('../../excel/player_features.xlsx', index=False)

In [None]:
player_features = pd.read_excel('../../excel/player_features.xlsx')

In [None]:
import pandas as pd

# Wide focus: sum of the start_* and end_* shares of all left/right zones.
wide_zone_columns = [
    f"{phase}_{band}_{side}"
    for phase in ("start", "end")
    for band in ("attacking", "middle", "defensive")
    for side in ("left", "right")
]
player_features["wide_focus"] = sum(player_features[name] for name in wide_zone_columns)

# Share of events starting in each third of the pitch (left + right + center).
for band in ("defensive", "middle", "attacking"):
    player_features[f"{band}_start"] = sum(
        player_features[f"start_{band}_{side}"] for side in ("left", "right", "center")
    )



In [None]:

# Columns rescaled to a per-90-minutes rate (value / minutes * 90).
#
# FIX: 'mean_length' was removed from this list. No column of that name is
# created by the aggregation in this notebook (the shot aggregation produces
# 'shot_mean_length'), so selecting it raised KeyError and the cell could not run.
# 'shot_mean_length' is deliberately not added: it is an average, not a total.
#
# NOTE(review): `minutes` comes from a left merge, so it may be NaN (and could
# be 0) for some players; those rows end up NaN / inf here -- confirm whether
# such players should be filtered out before the scaling/PCA steps.
per90_columns = ['Pass', 'Carry', 'Dribble', 'Shot', 'Duel', 'Pressure',
       'Block', 'Ball Recovery', 'Interception', 'Clearance', 'Foul Won',
       'Foul Committed', 'pass_switch', 'pass_cut_back',
       'headers', 'extra_shots', 'total_xg',
       'successful_dribbles', 'pressure_count',
       'total_distance', 'total_width_change', 'counterpress_count',
       'aerial_duels_won', 'total_packing', 'wide_focus']
player_features[per90_columns] = player_features[per90_columns].div(player_features["minutes"], axis=0) * 90
player_features.to_excel('../../excel/player_features_p90.xlsx', index=False)

In [None]:

# Drop columns that are already summarised by the composite features.

# All 18 start_/end_ zone-share columns.
zone_columns = [
    f"{phase}_{band}_{side}"
    for phase in ("start", "end")
    for band in ("attacking", "middle", "defensive")
    for side in ("left", "right", "center")
]

# Raw totals dropped together with the zone columns.
raw_total_columns = ['all_passes', 'successful_passes', 'all_shots']

# Further candidates that were considered (highly correlated) but are kept for now:
#   ground_pass, total_width_change, pressure_count, total_distance, total_packing,
#   total_duration, wide_focus, defensive_intensity, on_ball_actions, defensive_actions,
#   avg_directness, low_pass, Shot, pass_switch, pass_cut_back, aerial_duels_won,
#   defensive_start, middle_start
features_to_drop = zone_columns + raw_total_columns
player_features_dr = player_features.drop(columns=features_to_drop)

In [None]:
player_features_dr.columns

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Identifier / bookkeeping columns excluded from the feature matrix.
non_numeric_cols = ['player_id', 'player_name', 'minutes']
df_numeric = player_features_dr.drop(columns=non_numeric_cols, errors='ignore')

# Signed correlation heatmap.
corr_matrix = df_numeric.corr()
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", center=0, linewidths=0.5, vmin=-1, vmax=1, ax=ax)
ax.set_title("Feature Correlation Heatmap", fontsize=14)
plt.show()

# Absolute correlations, keeping only the strict upper triangle so that each
# feature pair appears once.
corr_matrix = corr_matrix.abs()
strict_upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_triangle = corr_matrix.where(strict_upper_mask)

# Transposing before stacking walks the matrix column by column, so the pairs
# come out as (column label, row label, value).
high_corr_pairs = [
    (col, row, value)
    for (col, row), value in upper_triangle.T.stack().items()
    if value > 0.80
]

if not high_corr_pairs:
    print("No feature pairs with correlation greater than 0.80.")
else:
    print("Highly correlated feature pairs (correlation > 0.80):")
    for col1, col2, corr_value in high_corr_pairs:
        print(f"{col1} and {col2}: {corr_value:.2f}")


In [None]:
df_numeric.columns  

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: standardize every feature.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_numeric)

# Step 2: PCA with all components, to inspect the variance spectrum.
pca = PCA()
pca.fit(scaled_features)

# Step 3: cumulative explained variance against the number of components.
explained_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = range(1, len(explained_variance) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, explained_variance, marker='o', linestyle='--')
ax.axhline(y=0.90, color='r', linestyle='--', label='90% Variance Threshold')
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('PCA - Explained Variance')
ax.grid()
ax.legend()
plt.show()

# Step 4: smallest number of components whose cumulative variance reaches 90%.
reaches_threshold = explained_variance >= 0.90
n_components = np.argmax(reaches_threshold) + 1
print(f"Optimal number of components: {n_components}")

# Step 5: refit with that many components and project the data.
pca_final = PCA(n_components=n_components)
pca_features = pca_final.fit_transform(scaled_features)

# Step 6: component scores as a DataFrame with columns PC1..PCn.
pc_names = [f'PC{i+1}' for i in range(n_components)]
pca_df = pd.DataFrame(pca_features, columns=pc_names)

# Step 7: loadings, one row per original feature and one column per component.
loadings = pd.DataFrame(pca_final.components_.T, index=df_numeric.columns, columns=pc_names)
print("PCA Feature Loadings:")
print(loadings)

# Step 8: scatter of the first two components.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=pca_df['PC1'], y=pca_df['PC2'], alpha=0.6, ax=ax)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA Projection (PC1 vs PC2)')
ax.grid()
plt.show()


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt

# Cluster only on the principal-component columns. Without this, re-running the
# cell would feed the previously written 'Cluster' labels back into K-Means.
pc_columns = [col for col in pca_df.columns if col != 'Cluster']
pc_data = pca_df[pc_columns]

# Step 1: elbow method and silhouette score for k = 2 .. max_cluster_test - 1.
# Both metrics come from the same fit (same data, same random_state), so each k
# is fitted once instead of twice.
max_cluster_test = 50
cluster_range = range(2, max_cluster_test)

wcss = []               # within-cluster sum of squares (KMeans.inertia_)
silhouette_scores = []
for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(pc_data)
    wcss.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(pc_data, cluster_labels))

# Elbow curve
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, wcss, marker='o', linestyle='--')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.grid()
plt.show()

# Silhouette curve
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, silhouette_scores, marker='s', linestyle='-', color='orange')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal Clusters')
plt.grid()
plt.show()

# Step 2: final model. Adjust `optimal_clusters` after reading the two curves.
optimal_clusters = 23
kmeans_final = KMeans(n_clusters=optimal_clusters, random_state=42)
pca_df['Cluster'] = kmeans_final.fit_predict(pc_data)

# Step 3: mean of every component and every original feature per cluster.
# A 'Cluster' column that a later cell may have added to `df_numeric` is left
# out so the concatenated frame has a single 'Cluster' column.
numeric_features = df_numeric.drop(columns='Cluster', errors='ignore').reset_index(drop=True)
cluster_profiles = pd.concat([pca_df, numeric_features], axis=1)
cluster_summary = cluster_profiles.groupby('Cluster').mean()


In [None]:
# Copy the cluster label of each row onto the player features (index-aligned).
player_features["Cluster"] = pca_df['Cluster']

# Distinct player ids from the events table. Despite its name this frame holds
# only `player_id` plus the former row index (column 'index'), no player names.
distinct_player_ids = event_df['player_id'].drop_duplicates()
names_unique = distinct_player_ids.to_frame().reset_index()
player_features_with_name = player_features.merge(names_unique, on="player_id", how="left")

In [None]:
player_features_with_name[player_features_with_name["Cluster"]==21].loc[:,["Goalkeeper", "Center Back", "Wide Back", "Center Midfield", "Wide Midfield", "Striker", "player_name", "minutes"]]

In [None]:
a = player_features_with_name.loc[:,["Cluster", 'avg_pass_length', 'wide_focus', 'attacking_start','total_xg']].groupby(["Cluster"]).describe().T
a

In [None]:
cluster_summary.T

In [None]:
pca_df

In [None]:
# Give every event the cluster label of the player who performed it.
player_features['Cluster'] = pca_df['Cluster']
player_cluster_lookup = player_features[['player_id', 'Cluster']]
df_event_with_clusters = df_event.merge(player_cluster_lookup, on='player_id', how='left')
df_event_with_clusters

In [None]:
# Cluster distribution per team: number of events per (team, player cluster).
#
# FIX: the grouping used to be `groupby(['player_id'])` only. `.size()` on a
# single key yields a Series with a one-level index, which `.unstack()` cannot
# pivot, and the result would not have been per team. Grouping by team and
# cluster gives one row per team and one column per cluster.
# Assumes the events table has a `team_name` column -- TODO confirm.
team_playing_styles = (
    df_event_with_clusters
    .groupby(['team_name', 'Cluster'])
    .size()
    .unstack(fill_value=0)
)

# Normalize each team's row to percentages.
team_playing_styles_percentage = team_playing_styles.div(team_playing_styles.sum(axis=1), axis=0) * 100

team_playing_styles_percentage




In [None]:
import matplotlib.pyplot as plt

# Stacked bar plot of the per-team cluster percentages.
# `DataFrame.plot` creates its own figure, so the extra `plt.figure(...)` call
# that used to precede it only produced a stray empty figure; the returned Axes
# is used for all further styling instead.
ax = team_playing_styles_percentage.plot(kind='bar', stacked=True, figsize=(14, 8), colormap='Set2')

ax.set_title('Team Playing Style Distribution')
ax.set_xlabel('Team')
ax.set_ylabel('Percentage of Possessions')
ax.legend(title='Playing Style Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)

ax.figure.tight_layout()
plt.show()


In [None]:
import seaborn as sns

# Annotated heatmap of the per-team cluster percentages.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(team_playing_styles_percentage, annot=True, fmt='.1f', cmap='YlGnBu', linewidths=0.5, ax=ax)

ax.set_ylabel('Team')
fig.tight_layout()

# Slightly tilt the column labels.
plt.setp(ax.get_xticklabels(), rotation=20)

plt.show()


In [None]:
# 1️⃣ Mapping Cluster Numbers to Style Names
# Only cluster ids 0-4 have a name here, while the K-Means cell of this
# notebook sets `optimal_clusters = 23`.
# NOTE(review): the style names describe possessions rather than players and
# look like they were carried over from a different clustering -- confirm they
# still apply, and whether the remaining cluster ids need names.
cluster_to_style = {
    0: 'High long ball from defense',
    1: 'Build-up against high press',
    2: 'Set pieces or crosses close to goal',
    3: 'Regular attack transition',
    4: 'Long possession during build-up'
}

# Apply the mapping
# Renames the columns of the percentage table in place; column labels that are
# not keys of the dict keep their current label.
team_playing_styles_percentage.rename(columns=cluster_to_style, inplace=True)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 2️⃣ Radar Plot Function
def plot_team_playing_style_radar(team_name, data):
    """Draw a radar (polar) chart of one row of ``data``.

    Args:
        team_name: Index label of the row to plot; ``data.loc[team_name]`` is
            used, so an unknown label raises ``KeyError``.
        data: DataFrame whose columns become the spokes of the radar and whose
            row ``team_name`` supplies the value plotted on each spoke.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    style_labels = data.columns.tolist()
    row_values = data.loc[team_name].values.tolist()

    # One equally spaced angle per column.
    spoke_angles = np.linspace(0, 2 * np.pi, len(style_labels), endpoint=False).tolist()

    # Repeat the first point at the end so the outline closes on itself.
    closed_values = row_values + row_values[:1]
    closed_angles = spoke_angles + spoke_angles[:1]

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'polar': True})
    ax.plot(closed_angles, closed_values, linewidth=2, linestyle='solid', label=team_name)
    ax.fill(closed_angles, closed_values, alpha=0.25)

    # One tick per column, labelled with the column name; radial tick labels hidden.
    ax.set_xticks(spoke_angles)
    ax.set_xticklabels(style_labels, fontsize=10)
    ax.set_yticklabels([])

    ax.set_title(f'Playing Style Radar for {team_name}', size=14, y=1.1)
    ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    plt.show()

# Example: Plotting for a specific team
plot_team_playing_style_radar('Croatia', team_playing_styles_percentage)


In [None]:
plot_team_playing_style_radar('Spain', team_playing_styles_percentage)

In [None]:
player_features

In [None]:
# `Session` is the SQLAlchemy ORM session (the object queried/committed below),
# not the HTTP session class from `requests`.
from sqlalchemy.orm import Session
from sql_schemas import Possessions


def init_possessions(db: Session, df, tableClass = Possessions):
    """Replace the contents of ``tableClass``'s table with the rows of ``df``.

    Args:
        db: Database session used for the delete, the insert and the commit.
        df: DataFrame with one row per record. The columns read are
            ``match_id``, ``possession``, ``playing_style``, ``Pass``,
            ``Carry``, ``Dribble``, ``Shot``, ``Duel``, ``Pressure``, ``Block``,
            ``Ball Recovery``, ``Interception``, ``Clearance``,
            ``avg_pass_length``, ``pass_success_rate`` and
            ``possession_team_name``.
        tableClass: Mapped class to instantiate for each row.

    Raises:
        KeyError: if ``df`` lacks one of the columns above. This now happens
            before the table is touched.
        Exception: any database error is re-raised after the session has been
            rolled back.

    Notes:
        * ``playing_style`` is translated through the notebook-level
          ``cluster_to_style`` dict; ids without an entry are stored as None.
        * The delete and the insert are committed together, so a failed insert
          no longer leaves the table emptied.
        * The previous debug ``print`` of every ORM object was replaced by a
          one-line row count.
    """
    # Build all objects first so that a malformed `df` fails before any delete.
    new_rows = [
        tableClass(
            match_id=p['match_id'], possession=p['possession'],
            playing_style=cluster_to_style.get(p['playing_style']),
            passes=p['Pass'],
            carry=p['Carry'],
            dribble=p['Dribble'],
            shot=p['Shot'],
            duel=p['Duel'],
            pressure=p['Pressure'],
            block=p['Block'],
            ball_recovery=p['Ball Recovery'],
            interception=p['Interception'],
            clearance=p['Clearance'],
            avg_pass_length=p['avg_pass_length'],
            pass_success_rate=p['pass_success_rate'],
            possession_team_name=p['possession_team_name']
        )
        for p in df.to_dict(orient="records")
    ]

    try:
        db.query(tableClass).delete()
        if new_rows:
            db.bulk_save_objects(new_rows)
        db.commit()
    except Exception:
        # Leave the session usable and the table as it was before the call.
        db.rollback()
        raise
    print(f"{len(new_rows)} rows written")

In [None]:
db.rollback()

In [None]:
possession_features_original['Cluster'] = player_features['Cluster']
possession_features_original.to_csv('test.csv')



In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Assume you already have X and cluster labels
X = pd.read_csv('test.csv')



In [None]:
# Prepare the frame handed to `init_possessions` and write it to the database.
# `df` is another name for `possession_features_original` (no copy), so the
# column added on the next line is added to that frame as well.
df = possession_features_original
df['playing_style'] = df['Cluster']
# One row per (match_id, possession) with the max of `possession_team_name`
# within the group.
possession_team = event_df.groupby(['match_id', 'possession'])['possession_team_name'].max().reset_index()
possession_team
# NOTE(review): `possession_features_original` is created in this notebook as a
# copy of the per-player feature table (grouped by player_id); no visible step
# adds `match_id` / `possession` columns to it, so this merge looks like it will
# fail on the join keys -- confirm which frame is meant here.
df = df.merge(possession_team[['match_id', 'possession', 'possession_team_name']], on=['match_id', 'possession'], how='left')
df
#df_final = df.rename(columns={"Pass": "passes"})
#df_final

# Deletes the existing rows of the target table and inserts the rows of `df`.
init_possessions(db,df)

In [None]:

# Number of rows per (team, cluster), pivoted to one column per cluster.
team_possessions = df.groupby(['possession_team_name', 'Cluster']).size().unstack(fill_value=0)

# Convert each team's row to percentages.
team_row_totals = team_possessions.sum(axis=1)
team_possessions_percentage = team_possessions.div(team_row_totals, axis=0) * 100

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(team_possessions_percentage, annot=True, fmt='.1f', cmap='YlGnBu', linewidths=0.5, ax=ax)

ax.set_title('Heatmap of Team Playing Styles (%)')
ax.set_xlabel('Playing Style Cluster')
ax.set_ylabel('Team')
fig.tight_layout()

plt.show()


In [None]:
plot_team_playing_style_radar('Spain', team_possessions_percentage)

In [None]:
# Feature-wise profile of every cluster in standardized feature space.
# Note: this adds a 'Cluster' column to `df_numeric`.
df_numeric['Cluster'] = df['Cluster']
labels = df['Cluster']

for cluster_id in np.unique(labels):
    line_color = f'C{cluster_id}'
    cluster_points = scaled_features[labels == cluster_id]
    # Transposed so that each member of the cluster is drawn as one faint line.
    plt.plot(cluster_points.T, alpha=0.2, color=line_color)
    # Bold line: the cluster's mean per feature.
    plt.plot(cluster_points.mean(axis=0), color=line_color, label=f'Cluster {cluster_id}', linewidth=2)

plt.xlabel("Feature Index")
plt.ylabel("Feature Value")
plt.title("Feature-wise Profile of Each Cluster")
plt.legend()
plt.show()

In [None]:
sns.pairplot(df_numeric.loc[:,['start_x', 'end_x', 'total_xg', 'longest_forward_pass', 'Cluster']], hue='Cluster', corner=True, diag_kind='hist')

In [None]:
# Map the K-Means centres back to the original feature units:
# PCA space -> standardized feature space -> original feature space.
centers = kmeans_final.cluster_centers_
print(centers.shape)
centers_original_space = pca_final.inverse_transform(centers)
cluster_centers_orig = scaler.inverse_transform(centers_original_space)

# Feature names, without the 'Cluster' label column another cell may have added.
columns = [col for col in df_numeric.columns if col != 'Cluster']
df_centers = pd.DataFrame(cluster_centers_orig, columns=columns)

# The frame is transposed for plotting: rows (y axis) are features and columns
# (x axis) are clusters. FIX: the two axis labels used to be swapped.
plt.figure(figsize=(12, 8))
sns.heatmap(df_centers.T, annot=True, fmt=".2f", cmap='coolwarm')
plt.xlabel("Clusters")
plt.ylabel("Features")
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Per-cluster feature means without the principal-component columns.
is_pc_column = cluster_summary.columns.str.startswith('PC')
cluster_summary_cleaned = cluster_summary.loc[:, ~is_pc_column]

# Transposed for plotting: features on the y axis, clusters on the x axis.
# FIX: the two axis labels used to be swapped.
plt.figure(figsize=(12, 8))  # adjust size as needed
sns.heatmap(cluster_summary_cleaned.T, annot=True, fmt=".2f", cmap='coolwarm')
plt.xlabel("Clusters")
plt.ylabel("Features")
plt.show()
cluster_summary.T.columns

In [None]:
from pandas import DataFrame

# Per-cluster means of the standardized features.
print(scaled_features)

# FIX: feature names were taken as `df_numeric.columns[:-1]`, which is only
# right when 'Cluster' happens to be the last column of `df_numeric`. Excluding
# it by name works wherever the column sits.
feature_columns = [col for col in df_numeric.columns if col != 'Cluster']
cluster_profiles = pd.DataFrame(scaled_features, columns=feature_columns, index=df_numeric.index)
cluster_profiles['Cluster'] = df_numeric['Cluster']
cluster_summary = cluster_profiles.groupby('Cluster').mean()

# Transposed for plotting: features on the y axis, clusters on the x axis.
# FIX: the two axis labels used to be swapped.
plt.figure(figsize=(12, 6))
sns.heatmap(cluster_summary.T, annot=True, fmt=".2f", cmap='coolwarm')
plt.xlabel("Clusters")
plt.ylabel("Features")
plt.show()

In [None]:
df_event_with_clusters.columns

In [None]:
df_event_with_clusters.loc[:,['type_name','player_name','position_name','Cluster']]
df_event_with_clusters = df_event_with_clusters[df_event_with_clusters['position_name'].str.contains("Forward", na=False)]

In [None]:
player_cluster_keys = ['player_name', 'Cluster']

# a: number of events of each type per (player, cluster).
a = df_event_with_clusters.groupby(player_cluster_keys)['type_name'].value_counts().reset_index()

# b: per (player, cluster), the share of passes whose outcome_name is missing.
pass_rows = df_event_with_clusters[df_event_with_clusters['type_name'] == 'Pass']
b = pass_rows.groupby(player_cluster_keys)['outcome_name'].apply(lambda g: g.isna().mean()).reset_index()

# Twenty largest event counts within cluster 4.
a[a['Cluster'] == 4].sort_values(by='count', ascending=False).head(20)

In [None]:
a = a.merge(b, on=['player_name', 'Cluster'], how='left')

In [None]:
# Rows for dribbles with more than 3 occurrences. Both conditions are combined
# into one mask: chaining `a[mask1][mask2]` applied a mask built on the full
# frame to the already filtered frame, which pandas has to re-align (with a
# warning) before it can be used.
is_frequent_dribble = (a['type_name'] == 'Dribble') & (a['count'] > 3)
c = a[is_frequent_dribble]
c.sort_values(by='outcome_name', ascending=False).head(20)