In [1]:
import pandas as pd
import numpy as np

In [2]:
# start the spark server

from pyspark.sql import SparkSession

spark=SparkSession.builder.appName('goal_model').getOrCreate()

In [3]:
# Casting proper datatype to each column

from pyspark.sql.functions import col
from pyspark.sql.types import FloatType, BooleanType, StringType

df = spark.read.csv("../collection/dataCollect.csv", header=True)
df = df.withColumn("isGoal", col("isGoal").cast(BooleanType())) \
    .withColumn("shotDistance", col("shotDistance").cast(FloatType())) \
    .withColumn("shotAngle", col("shotAngle").cast(FloatType()))
df.show()

+--------+------------+---------+------+
|shotType|shotDistance|shotAngle|isGoal|
+--------+------------+---------+------+
|    Shot|   14.007655|26.770924|  true|
|    Shot|   15.808608|25.986925| false|
|    Shot|    4.414703|  78.7883|  true|
|    Shot|   23.057236|15.737205| false|
|    Shot|   29.563871|12.585866| false|
|    Shot|   12.891873|26.702152|  true|
|    Shot|    33.91361| 11.83811|  true|
|    Shot|   15.674297|24.855238|  true|
|    Shot|   16.126612|  17.3064| false|
|    Shot|   29.905733|13.255181| false|
|    Shot|   6.8620987| 53.84777|  true|
|    Shot|   10.007677|35.326347| false|
|    Shot|    30.17615|13.488423|  true|
|    Shot|   13.717584|29.756178|  true|
|    Shot|         8.4| 47.08689|  true|
|    Shot|   31.903494|12.511723| false|
|    Shot|         8.4| 47.08689| false|
|    Shot|   26.953978|15.080159| false|
|    Shot|    8.829406|43.602642|  true|
|    Shot|   24.388163|16.912554| false|
+--------+------------+---------+------+
only showing top

In [4]:
# convert df to pandas

pandas_df = df.toPandas()
pandas_df = pd.get_dummies(pandas_df, columns=['shotType'])
pandas_df['isGoal'] = pandas_df['isGoal'].astype(int)
pandas_df['shotType_Free kick shot'] = pandas_df['shotType_Free kick shot'].astype(int)
pandas_df['shotType_Penalty'] = pandas_df['shotType_Penalty'].astype(int)
pandas_df['shotType_Shot'] = pandas_df['shotType_Shot'].astype(int)

In [7]:
# Shuffle and split the dataset

dataset = pandas_df.sample(frac=1) # shuffle the dataset
train_size = int(0.7 * len(pandas_df))

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a classifier object
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)

tree_dataset = dataset.dropna()

# Get the feature and output column
tree_features_df = tree_dataset[["shotType_Free kick shot", "shotType_Penalty", "shotType_Shot", "shotDistance", "shotAngle"]]
tree_output_df = tree_dataset["isGoal"]


tree_dataset= tree_dataset.astype(int)
Xt_train, Xt_test = tree_features_df[:train_size], tree_features_df[train_size:]
yt_train, yt_test = tree_output_df[:train_size], tree_output_df[train_size:]

In [9]:

# Train the classifier on your data
clf.fit(Xt_train, yt_train)

In [10]:

# Make predictions on new data
predictions = clf.predict(Xt_test)

In [11]:
# Evaluate the model performance
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(yt_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.6173814898419865


In [12]:
# Predict on new data:
predict_df = pd.DataFrame(np.array([[0,1,0, 80.673714, 4.937416]]), columns=["shotType_Free kick shot", "shotType_Penalty", "shotType_Shot", "shotDistance", "shotAngle"], index=(1,))

clf.predict(predict_df)

array([1])