In [None]:
# Shallow clone: only the latest snapshot of the data is needed, so skip the
# repository history instead of downloading all of it.
!git clone --depth 1 https://github.com/statsbomb/open-data.git

Cloning into 'open-data'...
remote: Enumerating objects: 49843, done.[K
remote: Counting objects: 100% (5351/5351), done.[K
remote: Compressing objects: 100% (1332/1332), done.[K
remote: Total 49843 (delta 5243), reused 4097 (delta 3999), pack-reused 44492 (from 1)[K
Receiving objects: 100% (49843/49843), 6.45 GiB | 17.67 MiB/s, done.
Resolving deltas: 100% (46913/46913), done.
Updating files: 100% (7246/7246), done.


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned; pin them once the working set is known.
%pip install -q google-cloud-bigquery pandas scikit-learn Flask joblib flask-cors statsbombpy

Collecting flask-cors
  Downloading Flask_Cors-4.0.1-py2.py3-none-any.whl.metadata (5.5 kB)
Collecting statsbombpy
  Downloading statsbombpy-1.13.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.6/63.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting requests-cache (from statsbombpy)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting cattrs>=22.2 (from requests-cache->statsbombpy)
  Downloading cattrs-24.1.0-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache->statsbombpy)
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl.metadata (3.1 kB)
Downloading Flask_Cors-4.0.1-py2.py3-none-any.whl (14 kB)
Downloading statsbombpy-1.13.1-py3-none-any.whl (16 kB)
Downloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cattrs-24.1.0-p

In [None]:
import os
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Path to the directory containing the event JSON files inside the cloned
# open-data repository. The absolute /content prefix assumes the clone was made
# from /content — TODO confirm when running outside that environment.
event_files_path = '/content/open-data/data/events'

# Read one event file and flatten it into a table
def process_event_file(file_path):
    """Load a JSON event file and return it as a flattened DataFrame.

    The file at ``file_path`` is decoded as UTF-8 JSON and handed to
    ``pd.json_normalize``, so nested objects end up as dotted column names
    (for example ``type.name``).
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed_events = json.load(handle)
    flat_table = pd.json_normalize(parsed_events)
    return flat_table

# Sample a fraction of the event data (e.g., 10%)
sampling_fraction = 0.1

# Maximum number of event files to load; adjust based on RAM availability.
# 21 matches the previous cut-off, which stopped after 21 directory entries.
max_event_files = 21

# os.listdir() returns entries in arbitrary order, so sort the names to make
# the selection of files (and therefore the sample) reproducible between runs.
# Only .json files count towards the limit.
event_file_names = sorted(
    name for name in os.listdir(event_files_path) if name.endswith('.json')
)[:max_event_files]

# Load each selected file and keep a random sample of its events
all_event_data = []
for file_name in event_file_names:
    event_df = process_event_file(os.path.join(event_files_path, file_name))
    # A fixed random_state keeps the per-file sample reproducible
    all_event_data.append(event_df.sample(frac=sampling_fraction, random_state=42))

# pd.concat raises an opaque error on an empty list, so fail with a clear message
if not all_event_data:
    raise FileNotFoundError(f"No .json event files found in {event_files_path}")

# Concatenate all event data into a single DataFrame
all_event_data_df = pd.concat(all_event_data, ignore_index=True)
print(f"Total events loaded: {len(all_event_data_df)}")

# Function to create features for model training
def create_features(event_df):
    """Build per-event feature vectors and 'Tactical Shift' labels.

    For every event that has a player, the feature vector is
    ``[minute, total_passes, total_shots, total_fouls]``, where the three
    totals are that player's counts of 'Pass', 'Shot' and 'Foul Committed'
    events across the whole of ``event_df``.

    Args:
        event_df: DataFrame of flattened events. Reads the columns
            'player.id' and 'minute' (both optional) and 'type.name'.

    Returns:
        Tuple ``(features, labels)`` of numpy arrays with one entry per kept
        event. A label is 1 when the event's type is 'Tactical Shift', else 0.
        Both arrays are empty when no event has a player.

    Raises:
        KeyError: if at least one event has a player but the 'type.name'
            column is missing.

    NOTE(review): events without a player are dropped. If 'Tactical Shift'
    events never carry a player id, every label is 0 and the model is
    degenerate — the recorded accuracy of exactly 1.0 suggests this; confirm.
    """
    stat_for_type = {
        'Pass': 'total_passes',
        'Shot': 'total_shots',
        'Foul Committed': 'total_fouls',
    }
    n_events = len(event_df)

    def column_or_default(name, default):
        # Mirror row.get(name, default): a missing column yields the default
        if name in event_df.columns:
            return event_df[name].tolist()
        return [default] * n_events

    player_ids = column_or_default('player.id', None)
    minutes = column_or_default('minute', 0)

    # A missing player shows up as NaN after json_normalize. NaN is truthy, so
    # a plain truthiness test does not catch it; check for missing values
    # explicitly and still skip falsy ids as before.
    has_player = [pd.notna(pid) and bool(pid) for pid in player_ids]

    # Only touch 'type.name' when it is needed, so frames without any player
    # still return empty arrays instead of raising.
    if any(has_player):
        event_types = event_df['type.name'].tolist()
    else:
        event_types = [None] * n_events

    # First pass: accumulate per-player totals over the whole frame
    player_stats = {}
    for pid, event_type, keep in zip(player_ids, event_types, has_player):
        if not keep:
            continue  # Skip events without a player
        stats = player_stats.setdefault(
            pid, {'total_passes': 0, 'total_shots': 0, 'total_fouls': 0}
        )
        stat_name = stat_for_type.get(event_type)
        if stat_name is not None:
            stats[stat_name] += 1

    # Second pass: one feature vector and one label per kept event
    features = []
    labels = []
    for pid, minute, event_type, keep in zip(player_ids, minutes, event_types, has_player):
        if not keep:
            continue
        stats = player_stats[pid]
        features.append([
            minute,
            stats['total_passes'],
            stats['total_shots'],
            stats['total_fouls'],
        ])
        # Label: 1 if the event is a 'Tactical Shift', else 0
        labels.append(1 if event_type == 'Tactical Shift' else 0)

    return np.array(features), np.array(labels)

# Apply feature creation to the DataFrame
X, y = create_features(all_event_data_df)

# Report the class balance: accuracy on its own says little when (almost)
# every label is 0, because always predicting 0 already scores near 1.0.
print(f"Positive ('Tactical Shift') labels: {int(y.sum())} of {len(y)}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a DecisionTreeClassifier model.
# random_state makes tree construction reproducible between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy for 'Tactical Shift' Prediction:", accuracy)

# Save the model for later use
import joblib
joblib.dump(model, 'tactical_shift_model.pkl')

Total events loaded: 7677
Model Accuracy for 'Tactical Shift' Prediction: 1.0


['tactical_shift_model.pkl']

In [None]:
# Create the app directory first so this cell also works on a fresh runtime
!mkdir -p /content/app
%cd /content/app

/content/app


In [None]:
# Save the trained model again, this time into the current working directory.
# After the preceding `%cd /content/app` cell the pickle lands next to the app
# files — presumably so it is part of the Docker build context; confirm.
# Relies on `model` still being defined by the training cell.
import joblib
joblib.dump(model, 'tactical_shift_model.pkl')

['tactical_shift_model.pkl']

In [None]:
%%writefile app.py
from flask import Flask, request, jsonify
import joblib
import numpy as np
from flask_cors import CORS
import logging

# Setup logging
logging.basicConfig(level=logging.DEBUG)

# Load the trained model once, at import time.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only ship a model file that was produced by a trusted training run.
try:
    model = joblib.load('tactical_shift_model.pkl')
    logging.info("Model loaded successfully.")
except Exception as e:
    # Keep `model` defined so later code fails with a clear error instead of
    # a NameError, and log the full traceback rather than only the message.
    model = None
    logging.exception(f"Error loading model: {e}")

app = Flask(__name__)
CORS(app)  # Enable CORS for all routes

@app.route('/predict-tactical-shift', methods=['POST'])
def predict_tactical_shift():
    """Return tactical-shift probabilities for the submitted events.

    Expects a JSON object with an "events" list. The events are converted to
    features by ``preprocess_events`` and scored with the loaded model.

    Returns:
        200 with ``team_1_tactical_shift_probability`` and
        ``team_2_tactical_shift_probability`` (the latter is ``None`` when
        only one row was scored); 400 when the body is not a JSON object or
        "events" is not a non-empty list; 500 with an "error" message when
        preprocessing or prediction fails.

    NOTE: the two values are the predictions for the first and second rows of
    the feature matrix, i.e. the first two submitted events. The "team_1" /
    "team_2" key names are kept unchanged for API compatibility.
    """
    # No explicit OPTIONS branch: the route only registers POST, so the view
    # is never called for OPTIONS; preflight is left to Flask and flask-cors.

    # silent=True returns None on a missing/invalid JSON body instead of
    # raising, so bad input becomes a 400 rather than a 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    current_events = data.get("events")
    if not isinstance(current_events, list) or not current_events:
        return jsonify({"error": "'events' must be a non-empty list."}), 400

    try:
        # Preprocess the events (this should match your training process)
        X = preprocess_events(current_events)
        logging.debug(f"Input features shape: {X.shape}")

        # Get prediction probabilities; columns follow model.classes_
        proba = model.predict_proba(X)
        logging.debug(f"Prediction probabilities: {proba}")

        # Pick the column of the positive class (label 1). A model trained
        # on a single class only has that class's column; if label 1 was
        # never seen, the probability of a tactical shift is 0, not the
        # value of column 0.
        classes = list(model.classes_)
        if 1 in classes:
            positive = proba[:, classes.index(1)]
        else:
            positive = np.zeros(len(proba))

        result = {
            "team_1_tactical_shift_probability": float(positive[0]),
            "team_2_tactical_shift_probability": float(positive[1]) if len(positive) > 1 else None,
        }
        return jsonify(result)

    except Exception as e:
        logging.error(f"Error during prediction: {e}")
        return jsonify({"error": str(e)}), 500



import numpy as np
import logging

def preprocess_events(events):
    """Convert raw event dicts into the 4-column feature matrix.

    Each row is ``[minute, total_passes, total_shots, total_fouls]``: the
    event's minute followed by how many 'Pass', 'Shot' and 'Foul Committed'
    events the event's player has within ``events``. These are the same four
    features, in the same order, that the training code builds; the previous
    ``[minute, second, len(type), len(team)]`` vector did not match them.

    Args:
        events: list of event dicts with a nested ``type`` dict (required)
            and optional ``minute`` and nested ``player`` dict with an ``id``.
            ``None`` is treated like an empty list.

    Returns:
        numpy array of shape ``(len(events), 4)``. Events without a player
        keep their row, with zero totals, so row ``i`` always corresponds to
        ``events[i]``.

    Raises:
        KeyError: if an event has no ``type`` entry.
    """
    try:
        # Position of each counted event type inside a player's totals list
        counted = {'Pass': 0, 'Shot': 1, 'Foul Committed': 2}

        # First pass: accumulate per-player totals and remember each event's
        # minute and player so the rows can be built afterwards.
        per_player = {}
        rows = []
        for event in (events or []):
            event_type = event['type'].get('name', 'Unknown')
            player_id = (event.get('player') or {}).get('id')
            if player_id is not None:
                totals = per_player.setdefault(player_id, [0, 0, 0])
                if event_type in counted:
                    totals[counted[event_type]] += 1
            rows.append((event.get('minute') or 0, player_id))

        # Second pass: one feature vector per event
        features = [
            [minute] + per_player.get(player_id, [0, 0, 0])
            for minute, player_id in rows
        ]

        # reshape keeps the result 2D with 4 columns even for empty input
        X = np.array(features).reshape(-1, 4)

        # Log the feature vectors for debugging
        logging.debug(f"Feature vectors: {X}")

        return X

    except KeyError as e:
        logging.error(f"Missing expected feature: {e}")
        raise
    except Exception as e:
        logging.error(f"Error in preprocessing: {e}")
        raise


if __name__ == '__main__':
    import os

    # The Werkzeug debugger allows arbitrary code execution, so it must not be
    # on by default for a server bound to all interfaces. Opt in explicitly
    # with FLASK_DEBUG=1 during local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '0') == '1'
    app.run(host='0.0.0.0', port=8080, debug=debug_enabled)

Overwriting app.py


In [None]:
%%writefile Dockerfile
# Use an official Python runtime as a parent image
# NOTE(review): the model pickle is produced in the notebook's environment;
# confirm it loads under this image's Python and scikit-learn versions.
FROM python:3.8-slim

# Set the working directory in the container
WORKDIR /app

# Install dependencies first so this layer stays cached when only the
# application code or model file changes
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the current directory contents into the container
COPY . /app

# Expose port 8080 for Flask
EXPOSE 8080

# Define environment variables for Flask
ENV FLASK_APP=app.py
ENV FLASK_RUN_HOST=0.0.0.0
# FLASK_ENV was removed in Flask 2.3; FLASK_DEBUG=0 keeps debug mode off
ENV FLASK_DEBUG=0

# Run the Flask app
CMD ["flask", "run", "--port=8080"]


Overwriting Dockerfile


In [None]:
%%writefile requirements.txt
# Packages installed into the image by the Dockerfile's `pip install -r`.
# NOTE(review): versions are unpinned. A pickled scikit-learn model is only
# expected to load reliably with the version that saved it — consider pinning
# scikit-learn and joblib to the versions of the training environment.
flask
flask-cors
joblib
numpy
pandas
scikit-learn

Overwriting requirements.txt


In [None]:
!gcloud auth login

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=kQPUL0V3CLxUEqgmYIzzuiYWQ9PZfa&prompt=consent&token_usage=remote&access_type=offline&code_challenge=Xfm9UmpOIW12UYBD6Vz__0Aja0nQ5Uf9vi0nIPRxoLY&code_challenge_method=S256

Once finished, enter the verification code provided in your browser: [REDACTED]

You are now logged in as [REDACTED].
Your current p

In [None]:
!gcloud config set project polar-ensign-432610-t7

Updated property [core/project].


In [None]:
!gcloud builds submit --tag gcr.io/polar-ensign-432610-t7/tactical-model

Creating temporary archive of 4 file(s) totalling 6.0 KiB before compression.
Uploading tarball of [.] to [gs://polar-ensign-432610-t7_cloudbuild/source/1724870649.343608-89d80a9e360440a895d6aba15feea0a9.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/polar-ensign-432610-t7/locations/global/builds/9dc9682f-c737-48b8-a027-a8dbc0a1e992].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/9dc9682f-c737-48b8-a027-a8dbc0a1e992?project=116084333061 ].
Waiting for build to complete. Polling interval: 1 second(s).
 REMOTE BUILD OUTPUT
starting build "9dc9682f-c737-48b8-a027-a8dbc0a1e992"

FETCHSOURCE
Fetching storage object: gs://polar-ensign-432610-t7_cloudbuild/source/1724870649.343608-89d80a9e360440a895d6aba15feea0a9.tgz#1724870650013844
Copying gs://polar-ensign-432610-t7_cloudbuild/source/1724870649.343608-89d80a9e360440a895d6aba15feea0a9.tgz#1724870650013844...
/ [1 files][  2.9 KiB/  2.9 KiB]                                                
Operation 