In [13]:
import pandas as pd
import numpy as np

In [14]:
# Create the Spark session for this notebook, or reuse one that already exists
# (getOrCreate hands back the running session instead of starting a second one).

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('goal_model')
    .getOrCreate()
)

In [15]:
# Load the collected shot data and cast each modelling column to its proper
# datatype. The CSV is read without an explicit schema, so the label and the
# two numeric features are cast by hand below; shotType is left untouched.

from pyspark.sql.functions import col
from pyspark.sql.types import FloatType, BooleanType, StringType

df = spark.read.csv("../../data/processed/collection/dataCollect.csv", header=True)

# (column name, target Spark type) pairs, applied in this order.
for column_name, spark_type in (
    ("isGoal", BooleanType()),
    ("shotDistance", FloatType()),
    ("shotAngle", FloatType()),
):
    df = df.withColumn(column_name, col(column_name).cast(spark_type))

df.show()

+--------+------------+---------+------+
|shotType|shotDistance|shotAngle|isGoal|
+--------+------------+---------+------+
|    Shot|   14.007655|26.770924|  true|
|    Shot|   15.808608|25.986925| false|
|    Shot|    4.414703|  78.7883|  true|
|    Shot|   23.057236|15.737205| false|
|    Shot|   29.563871|12.585866| false|
|    Shot|   12.891873|26.702152|  true|
|    Shot|    33.91361| 11.83811|  true|
|    Shot|   15.674297|24.855238|  true|
|    Shot|   16.126612|  17.3064| false|
|    Shot|   29.905733|13.255181| false|
|    Shot|   6.8620987| 53.84777|  true|
|    Shot|   10.007677|35.326347| false|
|    Shot|    30.17615|13.488423|  true|
|    Shot|   13.717584|29.756178|  true|
|    Shot|         8.4| 47.08689|  true|
|    Shot|   31.903494|12.511723| false|
|    Shot|         8.4| 47.08689| false|
|    Shot|   26.953978|15.080159| false|
|    Shot|    8.829406|43.602642|  true|
|    Shot|   24.388163|16.912554| false|
+--------+------------+---------+------+
only showing top 20 rows

In [16]:
# Convert the Spark DataFrame to pandas and one-hot encode the shot type.

pandas_df = df.toPandas()

# dtype=int makes get_dummies emit 0/1 integer indicator columns directly.
# This replaces the per-column astype(int) calls, which hard-coded the dummy
# column names and raised KeyError whenever one of the shot types was absent
# from the loaded data.
pandas_df = pd.get_dummies(pandas_df, columns=['shotType'], dtype=int)

# The label is boolean after the Spark cast; the model is trained on 0/1.
pandas_df['isGoal'] = pandas_df['isGoal'].astype(int)

In [30]:
# Shuffle and split the dataset (70 % train / 30 % test)

# A fixed random_state makes the shuffle -- and therefore the split and every
# metric computed from it -- reproducible across kernel restarts.
dataset = pandas_df.sample(frac=1, random_state=42)  # shuffle the dataset

# Get the feature and output column
output_df = dataset["isGoal"]
features_df = dataset.drop('isGoal', axis=1)

# Size the split from the shuffled frame that is actually being sliced.
train_size = int(0.7 * len(dataset))

# .iloc makes it explicit that the split is positional: the shuffled index
# labels are no longer in order, so label-based slicing would be wrong here.
X_train, X_test = features_df.iloc[:train_size], features_df.iloc[train_size:]
y_train, y_test = output_df.iloc[:train_size], output_df.iloc[train_size:]
print(X_train)

      shotDistance  shotAngle  shotType_Free kick shot  shotType_Penalty  \
5423     32.854321   9.817325                        0                 0   
8068     51.200008   7.554289                        0                 0   
2993     86.142952   4.863369                        0                 0   
8129     97.751534   4.238184                        0                 0   
5622     79.356514   4.893537                        0                 0   
...            ...        ...                      ...               ...   
1857     73.476662   5.624115                        0                 0   
2214     87.173874   4.806999                        0                 0   
1517     42.805027   7.700943                        0                 0   
1893     47.019676   6.781835                        0                 0   
1792     32.684528  11.530751                        0                 0   

      shotType_Shot  
5423              1  
8068              1  
2993              1  

In [72]:
# Train a random-forest classifier on the training split.
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

from sklearn.ensemble import RandomForestClassifier

# random_state pins the bootstrap sampling and the per-split feature
# selection, so re-running this cell on the same training data yields the
# same forest (and the same reported accuracy).
model = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    random_state=42,
)  # Adjust hyperparameters as needed
model.fit(X_train, y_train)

In [73]:
# Measure how often the model's prediction matches the true label on the
# held-out test split.

from sklearn.metrics import accuracy_score

test_predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, test_predictions)
print("Model Accuracy:", accuracy)

Model Accuracy: 0.579954954954955


In [74]:
# Predict the outcome of one hand-written shot.
# The values follow the order of df_column: distance, angle, then the three
# shot-type indicator columns (here: a free kick).
df_column = [
    "shotDistance",
    "shotAngle",
    "shotType_Free kick shot",
    "shotType_Penalty",
    "shotType_Shot",
]
prediction_array = np.array([40.937416, 16.673714, 1, 0, 0]).reshape(1, -1)
predict_new = pd.DataFrame(data=prediction_array, index=[1], columns=df_column)

predictions = model.predict(predict_new)

# A truthy first prediction is reported as a goal; anything else as a save.
print(
    "The following shot will result into Goal !!!"
    if predictions[0]
    else "Shot got saved."
)

The following shot will result into Goal !!!
