In [14]:
# import libraries

import pandas as pd
import numpy as np
import tensorflow as tf

In [15]:
# start the spark server

from pyspark.sql import SparkSession

spark=SparkSession.builder.appName('goal_model').getOrCreate()

In [16]:
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType, BooleanType, StringType

df = spark.read.csv("../data/processed/collection/dataCollect.csv", header=True)
df = df.withColumn("isGoal", col("isGoal").cast(BooleanType())).withColumn("shotDistance", col("shotDistance").cast(FloatType())) \
    .withColumn("shotAngle", col("shotAngle").cast(FloatType()))
df.show()

+--------+------------+---------+------+
|shotType|shotDistance|shotAngle|isGoal|
+--------+------------+---------+------+
|    Shot|   14.007655|26.770924|  true|
|    Shot|   15.808608|25.986925| false|
|    Shot|    4.414703|  78.7883|  true|
|    Shot|   23.057236|15.737205| false|
|    Shot|   29.563871|12.585866| false|
|    Shot|   12.891873|26.702152|  true|
|    Shot|    33.91361| 11.83811|  true|
|    Shot|   15.674297|24.855238|  true|
|    Shot|   16.126612|  17.3064| false|
|    Shot|   29.905733|13.255181| false|
|    Shot|   6.8620987| 53.84777|  true|
|    Shot|   10.007677|35.326347| false|
|    Shot|    30.17615|13.488423|  true|
|    Shot|   13.717584|29.756178|  true|
|    Shot|         8.4| 47.08689|  true|
|    Shot|   31.903494|12.511723| false|
|    Shot|         8.4| 47.08689| false|
|    Shot|   26.953978|15.080159| false|
|    Shot|    8.829406|43.602642|  true|
|    Shot|   24.388163|16.912554| false|
+--------+------------+---------+------+
only showing top

In [17]:
# convert df to pandas

pandas_df = df.toPandas()
pandas_df = pd.get_dummies(pandas_df, columns=['shotType'])
pandas_df['isGoal'] = pandas_df['isGoal'].astype(int)
pandas_df['shotType_Free kick shot'] = pandas_df['shotType_Free kick shot'].astype(int)
pandas_df['shotType_Penalty'] = pandas_df['shotType_Penalty'].astype(int)
pandas_df['shotType_Shot'] = pandas_df['shotType_Shot'].astype(int)
pandas_df.dtypes

shotDistance               float32
shotAngle                  float32
isGoal                       int32
shotType_Free kick shot      int32
shotType_Penalty             int32
shotType_Shot                int32
dtype: object

In [18]:
# Shuffle and split the dataset

dataset = pandas_df.sample(frac=1) # shuffle the dataset

# Get the feature and output column
features_df = dataset[["shotType_Free kick shot", "shotType_Penalty", "shotType_Shot", "shotDistance", "shotAngle"]]
output_df = dataset["isGoal"]

train_size = int(0.7 * len(pandas_df))


X_train, X_test = features_df[:train_size], features_df[train_size:]
y_train, y_test = output_df[:train_size], output_df[train_size:]

In [19]:
X_train

Unnamed: 0,shotType_Free kick shot,shotType_Penalty,shotType_Shot,shotDistance,shotAngle
5584,0,0,1,97.460182,4.263505
6865,0,0,1,66.647797,6.141677
2992,0,0,1,67.820526,6.027009
4033,0,0,1,45.569534,8.475406
2187,0,0,1,46.811115,7.638151
...,...,...,...,...,...
5014,0,0,1,24.167747,3.083666
4085,0,0,1,76.250000,4.922526
8291,0,0,1,47.069126,6.175329
2122,0,0,1,57.816822,5.930836


In [20]:
# Define the logistic regression model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation="relu", input_shape=(X_train.shape[1],)),  
    tf.keras.layers.Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [21]:
# Compile the model
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

In [22]:
# Train the model
model.fit(X_train, y_train, epochs=32)

Epoch 1/32
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5577 - loss: 1.7602
Epoch 2/32
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5887 - loss: 0.6822
Epoch 3/32
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5891 - loss: 0.6787
Epoch 4/32
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5999 - loss: 0.6744
Epoch 5/32
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5923 - loss: 0.6762
Epoch 6/32
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5984 - loss: 0.6740
Epoch 7/32
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5966 - loss: 0.6745
Epoch 8/32
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6007 - loss: 0.6729
Epoch 9/32
[1m195/195[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x1dc75049180>

In [23]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy:.4f}')
print(f'Loss: {loss:.4f}')

[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5939 - loss: 0.6754
Accuracy: 0.5980
Loss: 0.6738


In [24]:
# required shape [<KerasTensor shape=(None, 5), dtype=float32, sparse=None, name=keras_tensor_16>]

predict_df = pd.DataFrame(np.array([[0,1,0, 80.673714, 4.937416]]), columns=["shotType_Free kick shot", "shotType_Penalty", "shotType_Shot", "shotDistance", "shotAngle"], index=(1,))

# predict_df['shotType_Free kick shot'] = predict_df['shotType_Free kick shot'].astype(int)
# predict_df['shotType_Penalty'] = predict_df['shotType_Penalty'].astype(int)
# predict_df['shotType_Shot'] = predict_df['shotType_Shot'].astype(int)
# predict_df['shotDistance'] = predict_df['shotDistance'].astype('float32')
# predict_df['shotAngle'] = predict_df['shotAngle'].astype('float32')

final_predictions = model.predict(predict_df)
predict_df

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step


Unnamed: 0,shotType_Free kick shot,shotType_Penalty,shotType_Shot,shotDistance,shotAngle
1,0.0,1.0,0.0,80.673714,4.937416


In [25]:
final_predictions

array([[0.40398458]], dtype=float32)