In [None]:
# Fetch the StatsBomb open-data repository.
# --depth 1 downloads only the latest snapshot instead of the full history,
# and the directory check makes the cell safe to re-run on a warm runtime.
![ -d open-data ] || git clone --depth 1 https://github.com/statsbomb/open-data.git

Cloning into 'open-data'...
remote: Enumerating objects: 49843, done.[K
remote: Counting objects: 100% (5351/5351), done.[K
remote: Compressing objects: 100% (1332/1332), done.[K
remote: Total 49843 (delta 5243), reused 4097 (delta 3999), pack-reused 44492 (from 1)[K
Receiving objects: 100% (49843/49843), 6.45 GiB | 18.02 MiB/s, done.
Resolving deltas: 100% (46913/46913), done.
Updating files: 100% (7246/7246), done.


In [None]:
import os
import pandas as pd
import json
from statsbombpy import sb

# Path to the directory containing the event files
event_files_path = '/content/open-data/data/events'

# Function to load and process each event file
def process_event_file(file_path):
    """Load one event JSON file and flatten it into a DataFrame.

    Args:
        file_path: Path to a UTF-8 encoded JSON file holding a list of events.

    Returns:
        A pandas DataFrame produced by ``pd.json_normalize``; nested keys
        become dot-separated column names (e.g. ``type.name``).
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_events = json.load(handle)

    flattened = pd.json_normalize(raw_events)
    return flattened

# Sample a fraction of the event data (e.g., 10%)
sampling_fraction = 0.1

# Upper bound on the number of match files read, to keep the example small.
max_event_files = 20

# os.listdir returns entries in arbitrary order, so sort the names to make the
# selection reproducible. Filter to .json *before* applying the cap so that
# non-JSON entries do not count towards the limit.
event_file_names = sorted(
    name for name in os.listdir(event_files_path) if name.endswith('.json')
)[:max_event_files]

# Load each selected file and keep a random sample of its events
all_event_data = []
for file_name in event_file_names:
    file_path = os.path.join(event_files_path, file_name)
    event_df = process_event_file(file_path)

    # Fixed random_state keeps the per-file sample identical across runs
    sampled_df = event_df.sample(frac=sampling_fraction, random_state=42)
    all_event_data.append(sampled_df)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail with a message that points at the actual problem instead.
if not all_event_data:
    raise FileNotFoundError(f"No .json event files found in {event_files_path}")

# Concatenate all event data into a single DataFrame
all_event_data_df = pd.concat(all_event_data, ignore_index=True)

# Display the size of the data to ensure it fits into RAM
print(f"Total events loaded: {len(all_event_data_df)}")

Total events loaded: 7677


In [None]:
def create_features(event_df):
    """Attach per-player event counts to every row of an event DataFrame.

    For each ``player.id`` the number of non-null values in three source
    columns is counted and merged back onto the events:

    * ``pass.outcome.name``        -> ``total_passes``
    * ``shot.outcome.name``        -> ``total_shots``
    * ``foul_committed.card.name`` -> ``total_fouls``

    A source column that is absent from ``event_df`` contributes a count of 0.
    Rows without a ``player.id`` get 0 for all three counts.

    The input frame is not modified, and calling the function on its own
    output yields the same result (previously computed count columns are
    replaced rather than duplicated with ``_x``/``_y`` suffixes).

    Args:
        event_df: Flattened event DataFrame containing a ``player.id`` column.

    Returns:
        A new DataFrame with the three count columns appended.

    Raises:
        KeyError: If ``event_df`` has no ``player.id`` column.
    """
    count_sources = {
        'pass.outcome.name': 'total_passes',
        'shot.outcome.name': 'total_shots',
        'foul_committed.card.name': 'total_fouls',
    }
    feature_cols = list(count_sources.values())

    # Remove counts left over from an earlier call so the merge below cannot
    # produce suffixed duplicates.
    base_df = event_df.drop(columns=feature_cols, errors='ignore')
    if 'player.id' not in base_df.columns:
        raise KeyError('player.id')

    # reindex adds any missing source column as all-NaN, which counts as 0.
    counting_frame = base_df.reindex(columns=['player.id', *count_sources])
    player_actions = (
        counting_frame.groupby('player.id')
        .count()  # number of non-null entries per player
        .rename(columns=count_sources)
        .reset_index()
    )

    # Left merge keeps every event, including those without a player
    result = base_df.merge(player_actions, on='player.id', how='left')

    # Events without a player have no matching row in player_actions -> NaN -> 0
    for col in feature_cols:
        result[col] = pd.to_numeric(result[col], errors='coerce').fillna(0)

    return result

# Apply create_features to the DataFrame
all_event_data_df = create_features(all_event_data_df)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Prepare features and labels
feature_columns = ['minute', 'total_passes', 'total_shots', 'total_fouls']  # Add more features as needed
features = all_event_data_df[feature_columns]
# 1 for substitution events, 0 for everything else (vectorised comparison)
labels = all_event_data_df['type.name'].eq('Substitution').astype(int)

# Substitutions are a small minority of events, so stratify the split to keep
# the class ratio the same in train and test. Stratification needs at least
# two samples per class; fall back to a plain split otherwise.
stratify_labels = labels if labels.value_counts().min() >= 2 else None

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
print("Model Accuracy:", accuracy_score(y_test, y_pred))

# Accuracy alone is dominated by the majority class; per-class precision and
# recall show whether substitutions are actually being detected.
print(classification_report(y_test, y_pred, zero_division=0))


Model Accuracy: 0.9973958333333334


In [None]:
import joblib

# Persist the fitted classifier to disk so the serving app can load it
joblib.dump(value=model, filename='substitution_model.pkl')

['substitution_model.pkl']

In [None]:
!pip install flask-cors

Collecting flask-cors
  Downloading Flask_Cors-4.0.1-py2.py3-none-any.whl.metadata (5.5 kB)
Downloading Flask_Cors-4.0.1-py2.py3-none-any.whl (14 kB)
Installing collected packages: flask-cors
Successfully installed flask-cors-4.0.1


In [None]:
%%writefile app.py
from flask import Flask, request, jsonify
import joblib
import numpy as np
from flask_cors import CORS
import logging

# Setup logging
logging.basicConfig(level=logging.DEBUG)

# Load the trained model once, at import time.
# If loading fails the service cannot answer any request, so log the full
# traceback and re-raise instead of continuing with `model` undefined (which
# would only surface later as a NameError inside the request handler).
try:
    model = joblib.load('substitution_model.pkl')
    logging.info("Model loaded successfully.")
except Exception as e:
    logging.exception(f"Error loading model: {e}")
    raise

app = Flask(__name__)
CORS(app)  # Enable CORS for all routes

@app.route('/predict', methods=['POST'])
def predict():
    """Return substitution probabilities for the posted events.

    Expects a JSON object with an ``"events"`` key holding a list of at least
    two event dicts. The events are converted to a feature matrix by
    ``preprocess_events`` and scored with ``model.predict_proba``.

    Returns:
        200 with ``team_1_substitution_probability`` and
        ``team_2_substitution_probability``;
        400 when the body is not a JSON object or ``events`` is not a list of
        at least two items;
        500 with an ``error`` message when preprocessing or prediction fails.

    CORS preflight (OPTIONS) requests never reach this view: the route only
    registers POST, and preflight handling is left to ``CORS(app)``.
    """
    # silent=True yields None for a missing/invalid JSON body instead of
    # raising, so malformed requests can be answered with a 400 below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    current_events = data.get("events")
    # Two rows of the probability matrix are read below, so at least two
    # events are required.
    if not isinstance(current_events, list) or len(current_events) < 2:
        return jsonify({"error": "'events' must be a list of at least two events."}), 400

    try:
        # Preprocess the events (this should match your training process)
        X = preprocess_events(current_events)

        # Log the shape of the preprocessed data to verify the feature count
        logging.info(f"Shape of preprocessed data: {X.shape}")

        # Get prediction probabilities: one row per input row of X
        proba = model.predict_proba(X)

        # NOTE(review): rows of `proba` correspond to the first and second
        # *event*, not to teams; confirm callers send exactly one row per team
        # or aggregate per team here.
        result = {
            # float() converts numpy scalars to plain Python floats for JSON
            "team_1_substitution_probability": float(proba[0][1]),
            "team_2_substitution_probability": float(proba[1][1]),
        }

        return jsonify(result)

    except Exception as e:
        logging.error(f"Error during prediction: {e}")
        return jsonify({"error": str(e)}), 500

def preprocess_events(events):
    """Build the model's feature matrix from raw (nested) event dicts.

    The classifier is trained on four columns, in this order:
    ``minute, total_passes, total_shots, total_fouls``. The three totals are
    per-player counts of events that carry a non-null ``pass.outcome.name``,
    ``shot.outcome.name`` and ``foul_committed.card.name`` respectively. This
    function reproduces those columns from the nested form of each event
    (e.g. ``event['pass']['outcome']['name']``), counting over the events
    passed in.

    Args:
        events: Iterable of event dicts. ``minute`` defaults to 0 when absent;
            events without ``player.id`` get 0 for all three totals.

    Returns:
        A float numpy array of shape ``(len(events), 4)``; an empty input
        yields shape ``(0, 4)``.

    Raises:
        Exception: Any error raised while reading the events is logged and
            re-raised unchanged.
    """
    # Nested key paths whose presence is counted per player, in column order.
    count_paths = (
        ('pass', 'outcome', 'name'),
        ('shot', 'outcome', 'name'),
        ('foul_committed', 'card', 'name'),
    )

    def _nested(event, *keys):
        # Walk nested dicts; return None as soon as a level is missing.
        node = event
        for key in keys:
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node

    try:
        events = list(events)

        # First pass: count qualifying events for each player.
        per_player = {}
        for event in events:
            player_id = _nested(event, 'player', 'id')
            if player_id is None:
                continue
            counts = per_player.setdefault(player_id, [0] * len(count_paths))
            for idx, path in enumerate(count_paths):
                if _nested(event, *path) is not None:
                    counts[idx] += 1

        # Second pass: one feature row per event.
        no_player_counts = [0] * len(count_paths)
        features = []
        for event in events:
            player_counts = per_player.get(_nested(event, 'player', 'id'), no_player_counts)
            features.append([event.get('minute', 0), *player_counts])

        # reshape keeps the array 2-D even when there are no events
        X = np.array(features, dtype=float).reshape(-1, 1 + len(count_paths))

        # Log the feature vectors for debugging
        logging.debug(f"Feature vectors: {X}")

        return X

    except Exception as e:
        logging.error(f"Error in preprocessing: {e}")
        raise

if __name__ == '__main__':
    import os

    # Honour the PORT environment variable when the hosting platform sets one;
    # fall back to 8080 otherwise.
    port = int(os.environ.get('PORT', '8080'))

    # The Werkzeug debugger allows arbitrary code execution, so it must never
    # be on by default for a server bound to all interfaces. Opt in explicitly
    # for local development with FLASK_DEBUG=1.
    debug = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')

    app.run(host='0.0.0.0', port=port, debug=debug)

Overwriting app.py


In [None]:
%%writefile Dockerfile
# Use an official Python runtime as a parent image
FROM python:3.8-slim

# Set the working directory
WORKDIR /app

# Copy only the dependency list first so the (slow) install layer is cached
# and reused as long as requirements.txt does not change
COPY requirements.txt /app/requirements.txt

# Install any necessary dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the build context into the container at /app
COPY . /app

# Document that the application listens on port 8080
EXPOSE 8080

# Define environment variable (key=value form)
ENV NAME=World

# Run app.py when the container launches
CMD ["python", "app.py"]

Writing Dockerfile


In [None]:
%%writefile requirements.txt
Flask
joblib
numpy
scikit-learn
flask-cors

Overwriting requirements.txt


In [None]:
%%writefile .dockerignore
# Ignore Python cache files
__pycache__/
*.pyc

# Ignore specific directories
sample_data
open-data

# Ignore any temporary files or directories
*.tmp
*.log
temp/

Overwriting .dockerignore


In [None]:
!gcloud auth login

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=QdgxOTGQqE4AUtZ3tjYNJO17JzfGdF&prompt=consent&token_usage=remote&access_type=offline&code_challenge=qhnaPIEIyCyqAs2olsBV9EXgWsGD5IaNhXlHbxq0Ml0&code_challenge_method=S256

Once finished, enter the verification code provided in your browser: [REDACTED]

You are now logged in as [thomasgeorgepasley@gmail.com].
Your current p

In [None]:
!gcloud config set project polar-ensign-432610-t7

Updated property [core/project].


In [None]:
%cd /content/app

/content/app


In [None]:
!ls

app.py	Dockerfile  requirements.txt  substitution_model.pkl


In [None]:
!gcloud builds submit --tag gcr.io/polar-ensign-432610-t7/substitution-model

Creating temporary archive of 4 file(s) totalling 352.4 KiB before compression.
Uploading tarball of [.] to [gs://polar-ensign-432610-t7_cloudbuild/source/1724856894.389428-08cc073ad7fd4e1288338d42793a56d3.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/polar-ensign-432610-t7/locations/global/builds/65bce125-93d4-4ce1-8d27-08c6aabb7887].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/65bce125-93d4-4ce1-8d27-08c6aabb7887?project=116084333061 ].
Waiting for build to complete. Polling interval: 1 second(s).
 REMOTE BUILD OUTPUT
starting build "65bce125-93d4-4ce1-8d27-08c6aabb7887"

FETCHSOURCE
Fetching storage object: gs://polar-ensign-432610-t7_cloudbuild/source/1724856894.389428-08cc073ad7fd4e1288338d42793a56d3.tgz#1724856895278207
Copying gs://polar-ensign-432610-t7_cloudbuild/source/1724856894.389428-08cc073ad7fd4e1288338d42793a56d3.tgz#1724856895278207...
/ [1 files][ 62.3 KiB/ 62.3 KiB]                                                
Operatio

In [None]:
# Deploy the built image to Cloud Run (fully managed) as a public service.
# Google Cloud region names use the "europe-" prefix: London is europe-west2.
!gcloud run deploy substitution-model-service \
--image gcr.io/polar-ensign-432610-t7/substitution-model \
--platform managed \
--region europe-west2 \
--allow-unauthenticated
