In [None]:
!pip install statsbombpy



In [None]:
import pandas as pd
import numpy as np
from statsbombpy import sb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the StatsBomb data
competitions = sb.competitions()
competition_id = 11  # La Liga
# NOTE(review): confirm this id against the `competitions` table above. In the
# StatsBomb open-data listing, season_id 4 appears to be 2018/2019 and
# 2017/2018 appears to be season_id 1.
season_id = 4  # 2017/2018

# Get match IDs for the competition and season
matches = sb.matches(competition_id=competition_id, season_id=season_id)

# Fetch events for each match and concatenate them into a single DataFrame.
# ignore_index=True gives the combined table one fresh 0..n-1 index instead of
# carrying over the index labels of every per-match frame.
event_data = pd.concat(
    [sb.events(match_id) for match_id in matches['match_id']],
    ignore_index=True,
)



In [None]:
# Feature Engineering
def create_player_features(event_data):
    """Count key actions per player from a StatsBomb events table.

    Parameters
    ----------
    event_data : pandas.DataFrame
        One row per event. Must contain ``player_id`` and ``type``. The
        columns ``pass_outcome``, ``shot_outcome``, ``pass_goal_assist`` and
        ``duel_type`` are used when present and treated as all-null when
        absent. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``player_id`` (rows without a player are dropped) with the
        integer totals ``pass_complete``, ``tackles``, ``shots``, ``goals``,
        ``assists``, ``fouls_committed`` and ``interceptions``.
    """

    def optional(name):
        """Return the named column, or an all-null stand-in when it is absent."""
        if name in event_data.columns:
            return event_data[name]
        return pd.Series(index=event_data.index, dtype=object)

    event_type = event_data['type']
    pass_outcome = optional('pass_outcome')

    flags = {
        # StatsBomb leaves pass_outcome empty for a completed pass and only
        # fills it for failures, so "complete" means a Pass with no outcome.
        # An explicit 'Complete' label is still honoured.
        'pass_complete': (event_type.eq('Pass') & pass_outcome.isna())
                         | pass_outcome.eq('Complete'),
        # StatsBomb records tackles as Duel events whose duel_type is
        # 'Tackle'; a plain 'Tackle' event type is still honoured.
        'tackles': event_type.eq('Tackle') | optional('duel_type').eq('Tackle'),
        'shots': event_type.eq('Shot'),
        'goals': optional('shot_outcome').eq('Goal'),
        # Compare against True explicitly: a missing value is NaN, and NaN is
        # truthy in Python, so a bare truthiness test counts it as an assist.
        'assists': optional('pass_goal_assist').eq(True),
        'fouls_committed': event_type.eq('Foul Committed'),
        'interceptions': event_type.eq('Interception'),
    }

    # Build a small side table from raw arrays so the caller's events frame is
    # left untouched and its index (possibly non-unique) plays no role.
    per_event = pd.DataFrame(
        {name: flag.to_numpy(dtype=int) for name, flag in flags.items()}
    )
    per_event.insert(0, 'player_id', event_data['player_id'].to_numpy())

    # Group by player and total the indicator columns
    return per_event.groupby('player_id', as_index=False).sum()

# Build the per-player totals table from the concatenated event rows.
player_stats = create_player_features(event_data)

  event_data['pass_complete'] = event_data['pass_outcome'].apply(lambda x: 1 if x == 'Complete' else 0)
  event_data['tackles'] = event_data['type'].apply(lambda x: 1 if x == 'Tackle' else 0)
  event_data['shots'] = event_data['type'].apply(lambda x: 1 if x == 'Shot' else 0)
  event_data['goals'] = event_data['shot_outcome'].apply(lambda x: 1 if x == 'Goal' else 0)
  event_data['assists'] = event_data['pass_goal_assist'].apply(lambda x: 1 if x else 0)
  event_data['fouls_committed'] = event_data['type'].apply(lambda x: 1 if x == 'Foul Committed' else 0)
  event_data['interceptions'] = event_data['type'].apply(lambda x: 1 if x == 'Interception' else 0)


In [None]:
def apply_weights(event_data, player_stats, matches):
    """Scale each player's totals by that player's own win / loss rate.

    A player's win rate is the share of the matches they appear in (at least
    one event) that their team won; the loss rate is the share their team
    lost. Draws count toward neither. Every total is multiplied by the win
    rate, except ``fouls_committed``, which is multiplied by the loss rate.

    Parameters
    ----------
    event_data : pandas.DataFrame
        Event rows with ``player_id``, ``match_id`` and ``team``.
    player_stats : pandas.DataFrame
        Per-player totals with ``player_id``, ``pass_complete``, ``tackles``,
        ``shots``, ``goals``, ``assists``, ``fouls_committed`` and
        ``interceptions``. Modified in place: seven ``weighted_*`` columns
        are added.
    matches : pandas.DataFrame
        Match results with ``match_id``, ``home_team``, ``away_team``,
        ``home_score`` and ``away_score``.

    Returns
    -------
    pandas.DataFrame
        The same ``player_stats`` object, now carrying the weighted columns.
    """
    result_columns = ['match_id', 'home_score', 'away_score', 'home_team', 'away_team']

    # One row per (player, match, team): a result is a property of the match,
    # so it must not be counted once per event.
    appearances = (
        event_data[['player_id', 'match_id', 'team']]
        .dropna(subset=['player_id'])
        .drop_duplicates()
        .merge(matches[result_columns], on='match_id', how='left')
    )

    # Determine whether the player's team won or lost (vectorized; a missing
    # score or team compares False, so unmatched rows count as neither).
    plays_home = appearances['team'] == appearances['home_team']
    plays_away = appearances['team'] == appearances['away_team']
    home_ahead = appearances['home_score'] > appearances['away_score']
    away_ahead = appearances['away_score'] > appearances['home_score']
    appearances['team_won'] = ((plays_home & home_ahead) | (plays_away & away_ahead)).astype(int)
    appearances['team_lost'] = ((plays_home & away_ahead) | (plays_away & home_ahead)).astype(int)

    # Per-player rates. A single global mean would be the same scalar for
    # every player and would carry no per-player information.
    rates = appearances.groupby('player_id')[['team_won', 'team_lost']].mean()
    # Players with no recorded appearance get weight 0 rather than NaN.
    win_rate = player_stats['player_id'].map(rates['team_won']).fillna(0.0)
    loss_rate = player_stats['player_id'].map(rates['team_lost']).fillna(0.0)

    # Apply weights to player statistics
    weighting = {
        'weighted_passes': ('pass_complete', win_rate),
        'weighted_tackles': ('tackles', win_rate),
        'weighted_shots': ('shots', win_rate),
        'weighted_goals': ('goals', win_rate),
        'weighted_assists': ('assists', win_rate),
        'weighted_fouls': ('fouls_committed', loss_rate),
        'weighted_interceptions': ('interceptions', win_rate),
    }
    for weighted_name, (source_name, rate) in weighting.items():
        player_stats[weighted_name] = player_stats[source_name] * rate

    return player_stats

# Requires `player_stats` from the feature-engineering cell above; the name is
# rebound to the frame returned by apply_weights, which carries the weighted_* columns.
player_stats = apply_weights(event_data, player_stats, matches)

In [None]:
# Single source of truth for the model inputs; the rating and X both use it.
feature_columns = ['weighted_passes', 'weighted_tackles', 'weighted_shots',
                   'weighted_goals', 'weighted_assists',
                   'weighted_fouls', 'weighted_interceptions']

# Create a synthetic player rating based on weighted features
# NOTE(review): weighted_fouls is added with a positive sign, so committing
# fouls raises the rating. Confirm that this is intended.
raw_rating = player_stats[feature_columns].sum(axis=1)

# Normalize the ratings to a 0-10 scale. When the best raw rating is 0 the
# division would give NaN targets (0/0), so the ratings are left at 0 instead.
top_rating = raw_rating.max()
if top_rating > 0:
    player_stats['rating'] = 10 * (raw_rating / top_rating)
else:
    player_stats['rating'] = 0.0

# Prepare the features and labels
# NOTE(review): y is a deterministic function of X (a scaled row sum of these
# same columns), so the scores below only show how closely the forest can
# approximate that sum. They say nothing about predicting real player quality.
X = player_stats[feature_columns]
y = player_stats['rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the RandomForest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 0.3480067449417202
R-squared: 0.9010126367325975
