In [111]:
# import libraries

import pandas as pd
import numpy as np
import tensorflow as tf

In [112]:
# start the spark server

from pyspark.sql import SparkSession

spark=SparkSession.builder.appName('goal_model').getOrCreate()

In [None]:
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType, BooleanType, StringType

df = spark.read.csv("collection/dataCollect.csv", header=True)
df = df.withColumn("isGoal", col("isGoal").cast(BooleanType())).withColumn("shotDistance", col("shotDistance").cast(FloatType())) \
    .withColumn("shotAngle", col("shotAngle").cast(FloatType()))
df.show()

In [213]:
# convert df to pandas

pandas_df = df.toPandas()
pandas_df = pd.get_dummies(pandas_df, columns=['shotType'])
pandas_df['isGoal'] = pandas_df['isGoal'].astype(int)
pandas_df['shotType_Free kick shot'] = pandas_df['shotType_Free kick shot'].astype(int)
pandas_df['shotType_Penalty'] = pandas_df['shotType_Penalty'].astype(int)
pandas_df['shotType_Shot'] = pandas_df['shotType_Shot'].astype(int)
pandas_df.dtypes

shotDistance               float32
shotAngle                  float32
isGoal                       int32
shotType_Free kick shot      int32
shotType_Penalty             int32
shotType_Shot                int32
dtype: object

In [130]:
# Shuffle and split the dataset

dataset = pandas_df.sample(frac=1) # shuffle the dataset

# Get the feature and output column
features_df = dataset[["shotType_Free kick shot", "shotType_Penalty", "shotType_Shot", "shotDistance", "shotAngle"]]
output_df = dataset["isGoal"]

train_size = int(0.7 * len(pandas_df))


X_train, X_test = features_df[:train_size], features_df[train_size:]
y_train, y_test = output_df[:train_size], output_df[train_size:]

In [261]:
X_train

Unnamed: 0,shotType_Free kick shot,shotType_Penalty,shotType_Shot,shotDistance,shotAngle
3708,0,0,1,52.500000,7.975764
622,0,0,1,72.768417,5.733676
1939,0,0,1,70.553879,5.482955
7648,0,0,1,18.899843,13.815485
617,0,0,1,102.408791,3.862184
...,...,...,...,...,...
1249,0,0,1,58.006100,7.059862
3222,0,0,1,36.475201,11.222692
4018,0,0,1,63.414448,6.346617
7937,0,0,1,44.541416,7.114545


In [132]:
# Define the logistic regression model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation="relu", input_shape=(X_train.shape[1],)),  
    tf.keras.layers.Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [133]:
# Compile the model
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model
model.fit(X_train, y_train, epochs=32)

In [239]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy:.4f}')
print(f'Loss: {loss:.4f}')

[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6206 - loss: 0.6656
Accuracy: 0.6081
Loss: 0.6702


In [289]:
# required shape [<KerasTensor shape=(None, 5), dtype=float32, sparse=None, name=keras_tensor_16>]

predict_df = pd.DataFrame(np.array([[0,1,0, 80.673714, 4.937416]]), columns=["shotType_Free kick shot", "shotType_Penalty", "shotType_Shot", "shotDistance", "shotAngle"], index=(1,))

# predict_df['shotType_Free kick shot'] = predict_df['shotType_Free kick shot'].astype(int)
# predict_df['shotType_Penalty'] = predict_df['shotType_Penalty'].astype(int)
# predict_df['shotType_Shot'] = predict_df['shotType_Shot'].astype(int)
# predict_df['shotDistance'] = predict_df['shotDistance'].astype('float32')
# predict_df['shotAngle'] = predict_df['shotAngle'].astype('float32')

final_predictions = model.predict(predict_df)
predict_df

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step


Unnamed: 0,shotType_Free kick shot,shotType_Penalty,shotType_Shot,shotDistance,shotAngle
1,0.0,1.0,0.0,80.673714,4.937416


In [290]:
final_predictions

array([[0.40887013]], dtype=float32)

In [291]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a classifier object
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)

tree_dataset = dataset.dropna()

# Get the feature and output column
tree_features_df = tree_dataset[["shotType_Free kick shot", "shotType_Penalty", "shotType_Shot", "shotDistance", "shotAngle"]]
tree_output_df = tree_dataset["isGoal"]


tree_dataset= tree_dataset.astype(int)
Xt_train, Xt_test = tree_features_df[:train_size], tree_features_df[train_size:]
yt_train, yt_test = tree_output_df[:train_size], tree_output_df[train_size:]

# Train the classifier on your data
clf.fit(Xt_train, yt_train)

# Make predictions on new data
predictions = clf.predict(predict_df)
predictions
# print(Xt_test.iloc[12])
# Evaluate the model performance
# from sklearn.metrics import accuracy_score
# accuracy = accuracy_score(yt_test, predictions)
# print("Accuracy:", accuracy)

array([1])