In [12]:
# Start (or reuse) the Spark session used by the rest of the notebook.
# findspark.init() must run BEFORE pyspark is imported: its job is to make the
# Spark installation's Python packages importable, so calling it after the
# pyspark import (as this cell used to) has no effect on that import.

import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('goal_model').getOrCreate()

In [13]:
# First we need to find the distance from shot to the goal
# We will leverage a UDF for it.
import math
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

@udf(FloatType())
def GetShotDistanceToGoal(x: int, y: int):
    ''' 
        Distance from a shot location to the centre of the goal.

        Translate 0-100 (x,y) coordinate-based distances to absolute positions
        using "average" field dimensions of 105x68 before combining in 2D dist calc.
        The goal centre is taken to be the point (100, 50) in 0-100 coordinates,
        so the result is in the same unit as the 105x68 dimensions
        (presumably metres).

        Parameter:
            x: X-coordinate on the 0-100 scale. Numeric strings are accepted
               because columns read from CSV without a schema arrive as strings.
            y: Y-coordinate on the 0-100 scale (same accepted types as x).
        
        Return:
            Float type, or None (SQL NULL) when a coordinate is missing or
            cannot be parsed as a number.
    '''

    try:
        # float() handles ints, floats and numeric strings alike.
        dx = (100 - float(x)) * (105 / 100)
        dy = (50 - float(y)) * (68 / 100)
    except (TypeError, ValueError):
        # None -> TypeError, unparseable text -> ValueError. Returning None
        # yields a NULL cell instead of failing the whole Spark job.
        return None

    return math.sqrt(dx * dx + dy * dy)

In [14]:
from pyspark.sql.functions import try_divide
import math

@udf(FloatType())
def GetShotAngleToGoal(x: int, y: int):
    '''
        Angle, in degrees, under which the goal mouth is seen from the shot.

        The 0-100 (x,y) coordinates are scaled to a 105x68 field. The two goal
        posts sit on the line x = 105 at y = 34 +/- 7.32/2. The angle at the
        shot location in the triangle (shot, post 1, post 2) follows from the
        law of cosines:

            cos(angle) = (d1^2 + d2^2 - 7.32^2) / (2 * d1 * d2)

        where d1 and d2 are the distances from the shot to each post.

        Parameter:
            x: X-coordinate on the 0-100 scale (numeric strings are accepted).
            y: Y-coordinate on the 0-100 scale (numeric strings are accepted).

        Return:
            Float type in [0, 180], or None (SQL NULL) when a coordinate is
            missing / not numeric, or when the shot location coincides with a
            goal post (the division would be by zero).
    '''
    try:
        # float() handles ints, floats and numeric strings alike.
        dx = 105 - float(x) * (105 / 100)
        y_abs = float(y) * (68 / 100)
    except (TypeError, ValueError):
        return None

    goal_width = 7.32

    # Squared distances from the shot to each goal post.
    d1_sq = dx ** 2 + (34 + goal_width / 2 - y_abs) ** 2
    d2_sq = dx ** 2 + (34 - goal_width / 2 - y_abs) ** 2

    # Plain Python division is used here: pyspark's try_divide builds a Column
    # expression and cannot be applied to Python floats inside a UDF. Its
    # "NULL on division by zero" behaviour is reproduced by returning None.
    # The chained comparison also rejects NaN / infinite denominators.
    denominator = 2 * math.sqrt(d1_sq) * math.sqrt(d2_sq)
    if not (0 < denominator < float('inf')):
        return None

    cos_angle = (d1_sq + d2_sq - goal_width ** 2) / denominator
    # Rounding can push the ratio marginally outside acos's [-1, 1] domain.
    cos_angle = max(-1.0, min(1.0, cos_angle))

    # Convert AFTER acos: scaling the cosine itself by 180/pi (as the previous
    # version did) moves it out of acos's domain for almost every input.
    return math.acos(cos_angle) * (180 / math.pi)

In [15]:
# Before we create a model, we need clean data for it.
# Let's use the positions of the events.
from pyspark.sql.functions import col

# Read the event log (first CSV row is the header), then keep only events
# named "Shot" plus "Free Kick" events whose sub-event is a free-kick shot
# or a penalty.
events_df = spark.read.csv("../Dataset/events_England.csv", header=True)
events_df = events_df.filter(
    (col("eventName") == "Shot")
    | (
        (col("eventName") == "Free Kick")
        & col("subEventName").isin(["Free kick shot", "Penalty"])
    )
)
events_df.show(truncate=False)


+-------+------------+--------------------------------------------------------------------+--------+------------------------------------------+-------+---------+------+-----------+-----------------+----------+---------+----------------------------+----------+----------+----------+----------+
|eventId|subEventName|tags                                                                |playerId|positions                                 |matchId|eventName|teamId|matchPeriod|eventSec         |subEventId|id       |tagsList                    |pos_orig_y|pos_orig_x|pos_dest_y|pos_dest_x|
+-------+------------+--------------------------------------------------------------------+--------+------------------------------------------+-------+---------+------+-----------+-----------------+----------+---------+----------------------------+----------+----------+----------+----------+
|10     |Shot        |[{'id': 101}, {'id': 402}, {'id': 201}, {'id': 1205}, {'id': 1801}] |25413   |[{'y': 41, 'x': 88}, 

In [16]:
# Read the match table (first CSV row is the header) and preview it.
matches_df = spark.read.options(header=True).csv("../Dataset/matches_England.csv")
matches_df.show()

+------+-------+--------+--------------------+--------+-------------------+------+--------------------+-------+--------------------+--------------------+--------------------+--------+-------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+
|status|roundId|gameweek|           teamsData|seasonId|            dateutc|winner|               venue|   wyId|               label|                date|            referees|duration|competitionId|team1.scoreET|team1.coachId|team1.side|team1.teamId|team1.score|team1.scoreP|team1.hasFormation|     team1.formation|team1.scoreHT|team1.formation.bench|team1.formation.lineup|team1.formation.s

In [17]:
# Left join: keep every event row and attach the match whose wyId equals the
# event's matchId.

combine_df = events_df.join(
    matches_df,
    on=(events_df["matchId"] == matches_df["wyId"]),
    how="left",
)
combine_df.printSchema()

# Release the notebook-level references to the two inputs.
# NOTE: after this, the cell cannot be re-run without re-running the cells
# that load events_df and matches_df.
events_df = matches_df = None

root
 |-- eventId: string (nullable = true)
 |-- subEventName: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- playerId: string (nullable = true)
 |-- positions: string (nullable = true)
 |-- matchId: string (nullable = true)
 |-- eventName: string (nullable = true)
 |-- teamId: string (nullable = true)
 |-- matchPeriod: string (nullable = true)
 |-- eventSec: string (nullable = true)
 |-- subEventId: string (nullable = true)
 |-- id: string (nullable = true)
 |-- tagsList: string (nullable = true)
 |-- pos_orig_y: string (nullable = true)
 |-- pos_orig_x: string (nullable = true)
 |-- pos_dest_y: string (nullable = true)
 |-- pos_dest_x: string (nullable = true)
 |-- status: string (nullable = true)
 |-- roundId: string (nullable = true)
 |-- gameweek: string (nullable = true)
 |-- teamsData: string (nullable = true)
 |-- seasonId: string (nullable = true)
 |-- dateutc: string (nullable = true)
 |-- winner: string (nullable = true)
 |-- venue: string (nullable = true

In [18]:
# First we derive an isGoal flag from the tagsList column (rows whose tag list contains id 101)

from pyspark.sql.functions import split, udf
from pyspark.sql.types import StringType

@udf(StringType())
def parse_value(list_str):
    '''
        Tell whether a stringified list of tag ids contains the id 101.

        The ids are extracted as whole numbers and compared exactly. A plain
        substring test ('101' in list_str) would also fire on any longer id
        that merely contains those digits, such as 1101 or 2101.

        Parameter:
            list_str: text form of the tag-id list, or None.

        Return:
            'True' if 101 is one of the ids, otherwise 'False' (also for
            None / empty input). Strings are returned, not booleans, to keep
            the existing StringType column contract.
    '''
    import re  # imported here so the function does not rely on another cell

    if list_str is None:
        return 'False'

    # Every maximal run of digits is one tag id.
    tag_ids = re.findall(r'\d+', str(list_str))
    return 'True' if '101' in tag_ids else 'False'

# Add (or overwrite) the isGoal column computed by parse_value from tagsList,
# then print the schema to confirm the new column is present.
combine_df = combine_df.withColumn("isGoal", parse_value(combine_df["tagsList"]))
combine_df.printSchema()


root
 |-- eventId: string (nullable = true)
 |-- subEventName: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- playerId: string (nullable = true)
 |-- positions: string (nullable = true)
 |-- matchId: string (nullable = true)
 |-- eventName: string (nullable = true)
 |-- teamId: string (nullable = true)
 |-- matchPeriod: string (nullable = true)
 |-- eventSec: string (nullable = true)
 |-- subEventId: string (nullable = true)
 |-- id: string (nullable = true)
 |-- tagsList: string (nullable = true)
 |-- pos_orig_y: string (nullable = true)
 |-- pos_orig_x: string (nullable = true)
 |-- pos_dest_y: string (nullable = true)
 |-- pos_dest_x: string (nullable = true)
 |-- status: string (nullable = true)
 |-- roundId: string (nullable = true)
 |-- gameweek: string (nullable = true)
 |-- teamsData: string (nullable = true)
 |-- seasonId: string (nullable = true)
 |-- dateutc: string (nullable = true)
 |-- winner: string (nullable = true)
 |-- venue: string (nullable = true

In [19]:
# Preview the joined frame; these are show()'s defaults, spelled out.
combine_df.show(n=20, truncate=True)

+-------+------------+--------------------+--------+--------------------+-------+---------+------+-----------+-----------------+----------+---------+--------------------+----------+----------+----------+----------+------+-------+--------+--------------------+--------+-------------------+------+----------------+-------+--------------------+--------------------+--------------------+--------+-------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+------+
|eventId|subEventName|                tags|playerId|           positions|matchId|eventName|teamId|matchPeriod|         eventSec|subEventId|       id|            tagsList|pos_