In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report
import json
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt

In [14]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

## Process Dataset

### Load Dataset

In [3]:
DATA_PATH = "/kaggle/input/soccer-outcome"
train_path = os.path.join(DATA_PATH, "concatenated_train.json")
test_path = os.path.join(DATA_PATH, "concatenated_test.json")

In [4]:
df_train = pd.read_json(train_path, orient="table")
df_test = pd.read_json(test_path, orient="table")

In [5]:
df_train

Unnamed: 0,EventId,PlayerId,PlayerWeight,PlayerHeight,FavorableFoot,XStart,XEnd,YStart,YEnd,Tags,Goal,MatchId,TeamId,MatchPeriod,EventSec,SubEventId,Id,PlayerSide,inGoal
0,10,25413,73,175,right,88,0,41,0,"[{'id': 101}, {'id': 402}, {'id': 201}, {'id':...",1,2499719,1609,1H,94.595788,100,177959212,1.0,0
1,9,8480,76,185,right,100,12,100,59,"[{'id': 101}, {'id': 1205}, {'id': 1802}]",1,2499719,1631,1H,96.970614,90,177959226,0.0,0
2,3,14853,89,187,left,94,93,0,41,[{'id': 1801}],0,2499719,1631,1H,175.308128,36,177959244,0.0,0
3,10,26150,62,179,left,85,100,52,100,"[{'id': 401}, {'id': 201}, {'id': 1211}, {'id'...",0,2499719,1631,1H,179.854785,100,177959247,0.0,0
4,3,7882,90,196,left,0,28,0,44,[],0,2499719,1609,1H,196.066859,34,177959227,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215514,3,3529,74,182,right,42,24,100,74,[{'id': 1801}],0,2565927,682,2H,2773.540076,36,253302621,1.0,0
215515,3,3310,80,174,left,54,37,0,11,[{'id': 1801}],0,2565927,675,2H,2806.801410,36,253302668,0.0,0
215516,3,14723,76,183,right,100,88,100,43,"[{'id': 302}, {'id': 801}, {'id': 1801}]",0,2565927,675,2H,2853.997053,30,253302703,0.0,0
215517,3,3486,82,185,right,0,7,0,16,[],0,2565927,682,2H,2873.020476,34,253302649,1.0,0


## Feature engineering

### Calculate shot distance and shot angle

In [6]:
def get_shot_distance(x,y):
    return np.sqrt(np.power((100-x)*105/100, 2) + np.power((50-y)*68/100, 2))

def get_shot_angle(x, y, tol=1e-6):
    # Translate coordinates to meters using average field dimensions of 105x68
    x_meters = x * 105 / 100
    y_meters = y * 68 / 100

    # Calculate squared distances
    dist1_squared = (105 - x_meters) ** 2 + (34 + 7.32 / 2 - y_meters) ** 2
    dist2_squared = (105 - x_meters) ** 2 + (34 - 7.32 / 2 - y_meters) ** 2
    goal_width_squared = 7.32 ** 2

    # Calculate distances
    dist1 = np.sqrt(dist1_squared)
    dist2 = np.sqrt(dist2_squared)

    # Calculate the angle in radians
    numerator = dist1_squared + dist2_squared - goal_width_squared
    denominator = 2 * dist1 * dist2
    cos_theta = max(-1, min(1, numerator / denominator))
    angle_radians = np.arccos(cos_theta)
    # Translate radians to degrees
    angle_degrees = angle_radians * 180 / np.arccos(-1)

    return angle_degrees


In [7]:
df_train["ShotDistance"] = df_train.apply(lambda x: get_shot_distance(x["XStart"], x["YStart"]), axis=1)
df_test["ShotDistance"] = df_test.apply(lambda x: get_shot_distance(x["XStart"], x["YStart"]), axis=1)
df_train["ShotAngle"] = df_train.apply(lambda x: get_shot_angle(x["XStart"], x["YStart"]), axis=1)
df_test["ShotAngle"] = df_test.apply(lambda x: get_shot_angle(x["XStart"], x["YStart"]), axis=1)

In [8]:
df_train

Unnamed: 0,EventId,PlayerId,PlayerWeight,PlayerHeight,FavorableFoot,XStart,XEnd,YStart,YEnd,Tags,...,MatchId,TeamId,MatchPeriod,EventSec,SubEventId,Id,PlayerSide,inGoal,ShotDistance,ShotAngle
0,10,25413,73,175,right,88,0,41,0,"[{'id': 101}, {'id': 402}, {'id': 201}, {'id':...",...,2499719,1609,1H,94.595788,100,177959212,1.0,0,14.007655,26.770923
1,9,8480,76,185,right,100,12,100,59,"[{'id': 101}, {'id': 1205}, {'id': 1802}]",...,2499719,1631,1H,96.970614,90,177959226,0.0,0,34.000000,0.000001
2,3,14853,89,187,left,94,93,0,41,[{'id': 1801}],...,2499719,1631,1H,175.308128,36,177959244,0.0,0,34.578751,2.233719
3,10,26150,62,179,left,85,100,52,100,"[{'id': 401}, {'id': 201}, {'id': 1211}, {'id'...",...,2499719,1631,1H,179.854785,100,177959247,0.0,0,15.808608,25.986925
4,3,7882,90,196,left,0,28,0,44,[],...,2499719,1609,1H,196.066859,34,177959227,1.0,0,110.367568,3.614442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215514,3,3529,74,182,right,42,24,100,74,[{'id': 1801}],...,2565927,682,2H,2773.540076,36,253302621,1.0,0,69.748190,5.250066
215515,3,3310,80,174,left,54,37,0,11,[{'id': 1801}],...,2565927,675,2H,2806.801410,36,253302668,0.0,0,59.066827,5.808618
215516,3,14723,76,183,right,100,88,100,43,"[{'id': 302}, {'id': 801}, {'id': 1801}]",...,2565927,675,2H,2853.997053,30,253302703,0.0,0,34.000000,0.000001
215517,3,3486,82,185,right,0,7,0,16,[],...,2565927,682,2H,2873.020476,34,253302649,1.0,0,110.367568,3.614442


### Define features and undersampling

In [9]:
input_features = ['XStart', 'YStart', "ShotDistance", "ShotAngle", "PlayerWeight", "PlayerHeight", "FavorableFoot",
                  "EventId", "PlayerId"]  # Adjust as needed
label_feature = 'Goal'

# "PlayerWeight", "PlayerHeight", "FavorableFoot"
# 'PlayerSide'

In [10]:
# Assuming df is your DataFrame and 'label' is the column with class labels
class_0_train, class_0_test = df_train[df_train[label_feature] == 0], df_test[df_test[label_feature] == 0]
class_1_train, class_1_test = df_train[df_train[label_feature] == 1], df_test[df_test[label_feature] == 1]

# Find the number of samples in the minority class
n_samples_train = min(len(class_0_train), len(class_1_train))
n_samples_test = min(len(class_0_test), len(class_1_test))

# Undersample the majority class
class_0_undersampled_train, class_0_undersampled_test = class_0_train.sample(n_samples_train), class_0_test.sample(n_samples_test)
class_1_undersampled_train, class_1_undersampled_test = class_1_train.sample(n_samples_train), class_1_test.sample(n_samples_test)

# Combine the undersampled data
df_balanced_train = pd.concat([class_0_undersampled_train, class_1_undersampled_train])
df_balanced_test = pd.concat([class_0_undersampled_test, class_1_undersampled_test])

# Shuffle the DataFrame
df_balanced_train = df_balanced_train.sample(frac=1).reset_index(drop=True)
df_balanced_test= df_balanced_test.sample(frac=1).reset_index(drop=True)

print(df_balanced_train[label_feature].value_counts())
print(df_balanced_test[label_feature].value_counts())

Goal
0    9191
1    9191
Name: count, dtype: int64
Goal
0    598
1    598
Name: count, dtype: int64


In [11]:
# input_features = ["XStart", "YStart", "ShotDistance", "ShotAngle", "PlayerWeight", "PlayerHeight", "EventId", "PlayerId"]

In [12]:
# import seaborn as sns
# # Let's also draw a heatmap visualization of the correlation matrix
# corr_matrix = df_train[input_features].corr(method='spearman')
# f, ax = plt.subplots(figsize=(16,8))
# sns.heatmap(corr_matrix, annot=True, fmt='.2f', linewidth=0.4,
#             annot_kws={"size": 10}, cmap='coolwarm', ax=ax)
# plt.xticks(fontsize=10)
# plt.yticks(fontsize=10)
# plt.show()

### Transform

In [15]:
X_train, X_test = df_balanced_train[input_features], df_balanced_test[input_features]
y_train, y_test = df_balanced_train[label_feature], df_balanced_test[label_feature]

# 2. Encode Categorical Features:
categorical_features = ["FavorableFoot", "EventId", "PlayerId"]  # Identify categorical columns
# categorical_features = ["EventId", "PlayerId"]  # Identify categorical columns

numeric_features = ['XStart', 'YStart', "ShotDistance","ShotAngle", "PlayerWeight", "PlayerHeight"]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OrdinalEncoder(), categorical_features)
    ])

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.fit_transform(X_test)

In [16]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((18382, 9), (18382,), (1196, 9), (1196,))

## Training Models

### Logistic Regression

In [22]:
from sklearn.linear_model import RidgeClassifier

In [23]:
model_logistic_regression = LogisticRegression(solver="newton-cg")
# model_logistic_regression = RidgeClassifier(alpha=0.8)
model_logistic_regression.fit(X_train, y_train)

In [30]:
y_pred = model_logistic_regression.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.845     0.831     0.838       598
           1      0.834     0.848     0.841       598

    accuracy                          0.839      1196
   macro avg      0.840     0.839     0.839      1196
weighted avg      0.840     0.839     0.839      1196



### Random forest

In [25]:
model_random_forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=3, verbose=True)
model_random_forest.fit(X_train, y_train)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.5s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    1.1s finished


In [31]:
y_pred = model_random_forest.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=3)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)

Accuracy: 0.89
Classification Report:
               precision    recall  f1-score   support

           0      0.927     0.851     0.888       598
           1      0.862     0.933     0.896       598

    accuracy                          0.892      1196
   macro avg      0.895     0.892     0.892      1196
weighted avg      0.895     0.892     0.892      1196



[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.1s finished


### XGBoost

In [27]:
# 1. Initialize the XGBoost Classifier
model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
# For multi-class classification, use objective='multi:softprob'

# 2. Optional: Hyperparameter Tuning
# (Uncomment and adjust parameters as needed)
params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01],
    'n_estimators': [100, 200]
}
grid_search = GridSearchCV(model, params, cv=3)
grid_search.fit(X_train, y_train, verbose = True)

In [28]:
model = grid_search.best_estimator_

# 3. Train the model
model.fit(X_train, y_train, verbose=True)

In [32]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=3)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)

Accuracy: 0.90
Classification Report:
               precision    recall  f1-score   support

           0      0.946     0.846     0.893       598
           1      0.861     0.952     0.904       598

    accuracy                          0.899      1196
   macro avg      0.903     0.899     0.899      1196
weighted avg      0.903     0.899     0.899      1196



### LSTM

In [17]:
X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

y_train_lstm = y_train.values.reshape(-1, 1)
y_test_lstm = y_test.values.reshape(-1, 1)

In [18]:
# Define the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(1, X_train_lstm.shape[2])))
model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(**kwargs)


In [19]:
# Train the model
history = model.fit(X_train_lstm, y_train_lstm, epochs=50, batch_size=32, validation_data=(X_test_lstm, y_test_lstm))

Epoch 1/50
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5998 - loss: 0.6694 - val_accuracy: 0.7074 - val_loss: 0.5280
Epoch 2/50
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7811 - loss: 0.4911 - val_accuracy: 0.8161 - val_loss: 0.4872
Epoch 3/50
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8085 - loss: 0.4500 - val_accuracy: 0.8370 - val_loss: 0.4496
Epoch 4/50
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8455 - loss: 0.4025 - val_accuracy: 0.8579 - val_loss: 0.4030
Epoch 5/50
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8614 - loss: 0.3721 - val_accuracy: 0.8503 - val_loss: 0.3907
Epoch 6/50
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8644 - loss: 0.3639 - val_accuracy: 0.8679 - val_loss: 0.3624
Epoch 7/50
[1m575/575[0m 

In [21]:
y_pred = (model.predict(X_test_lstm) > 0.5).astype("int32")
print(classification_report(y_test_lstm, y_pred, target_names=['0', '1'], digits=4))

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
              precision    recall  f1-score   support

           0     0.9482    0.8261    0.8829       598
           1     0.8459    0.9548    0.8971       598

    accuracy                         0.8905      1196
   macro avg     0.8971    0.8905    0.8900      1196
weighted avg     0.8971    0.8905    0.8900      1196

