In [1]:
# Start (or reuse) the Spark session used throughout this notebook.

from pyspark.sql import SparkSession

# Build the session through the fluent builder; getOrCreate() hands back an
# already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('goal_model')
    .getOrCreate()
)

In [2]:
# Before we can build a model, we need a clean dataset for it.
# Let's start from the events and keep only the ones we need; their positions are used later.
from pyspark.sql.functions import col

# Row predicates, named so the filter below reads as a sentence.
# Free-kick events are kept only when their sub-event is a shot or a penalty.
is_free_kick_attempt = (col("eventName") == "Free Kick") & (
    col("subEventName").isin(['Free kick shot', 'Penalty'])
)
is_shot_event = col("eventName") == "Shot"

# Read the events CSV (first row is the header) and keep rows matching either predicate.
events_df = (
    spark.read.csv("../Dataset/events_England.csv", header=True)
    .where(is_free_kick_attempt | is_shot_event)
)
events_df.show(truncate=False)


+-------+------------+--------------------------------------------------------------------+--------+------------------------------------------+-------+---------+------+-----------+-----------------+----------+---------+----------------------------+----------+----------+----------+----------+
|eventId|subEventName|tags                                                                |playerId|positions                                 |matchId|eventName|teamId|matchPeriod|eventSec         |subEventId|id       |tagsList                    |pos_orig_y|pos_orig_x|pos_dest_y|pos_dest_x|
+-------+------------+--------------------------------------------------------------------+--------+------------------------------------------+-------+---------+------+-----------+-----------------+----------+---------+----------------------------+----------+----------+----------+----------+
|10     |Shot        |[{'id': 101}, {'id': 402}, {'id': 201}, {'id': 1205}, {'id': 1801}] |25413   |[{'y': 41, 'x': 88}, 

In [3]:
# Load the match metadata; the first CSV row supplies the column names.
matches_csv = "../Dataset/matches_England.csv"
matches_df = spark.read.csv(matches_csv, header=True)
matches_df.show()

+------+-------+--------+--------------------+--------+-------------------+------+--------------------+-------+--------------------+--------------------+--------------------+--------+-------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+
|status|roundId|gameweek|           teamsData|seasonId|            dateutc|winner|               venue|   wyId|               label|                date|            referees|duration|competitionId|team1.scoreET|team1.coachId|team1.side|team1.teamId|team1.score|team1.scoreP|team1.hasFormation|     team1.formation|team1.scoreHT|team1.formation.bench|team1.formation.lineup|team1.formation.s

In [4]:
# Left-join the match data onto the events DataFrame

# Every event row is kept (left join); match columns are attached where the
# match's wyId equals the event's matchId.
same_match = matches_df["wyId"] == events_df["matchId"]
combine_df = events_df.join(matches_df, on=same_match, how="left")
combine_df.show()

+-------+------------+--------------------+--------+--------------------+-------+---------+------+-----------+-----------------+----------+---------+--------------------+----------+----------+----------+----------+------+-------+--------+--------------------+--------+-------------------+------+----------------+-------+--------------------+--------------------+--------------------+--------+-------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+
|eventId|subEventName|                tags|playerId|           positions|matchId|eventName|teamId|matchPeriod|         eventSec|subEventId|       id|            tagsList|pos_orig_y|

In [5]:
# Derive a boolean isGoal flag from the tagsList column

from pyspark.sql.functions import contains

# tagsList is loaded from CSV as a plain string, so a bare substring test for
# "101" is also true for any longer tag id that merely contains those digits
# (for example 1101 or 2101), which wrongly marks those rows as goals.
# Anchoring the match on word boundaries accepts only the standalone id 101.
# A null tagsList still yields a null isGoal, exactly as before.
GOAL_TAG_PATTERN = r"\b101\b"

combine_df = combine_df.withColumn('isGoal', col('tagsList').rlike(GOAL_TAG_PATTERN))
combine_df.show()


+-------+------------+--------------------+--------+--------------------+-------+---------+------+-----------+-----------------+----------+---------+--------------------+----------+----------+----------+----------+------+-------+--------+--------------------+--------+-------------------+------+----------------+-------+--------------------+--------------------+--------------------+--------+-------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+------+
|eventId|subEventName|                tags|playerId|           positions|matchId|eventName|teamId|matchPeriod|         eventSec|subEventId|       id|            tagsList|pos_

In [6]:
from pyspark.sql.types import IntegerType

# First, let's cast the position columns from string to integer

# Replace the two origin-coordinate columns with integer-typed versions of themselves.
origin_x_int = col("pos_orig_x").cast(IntegerType())
origin_y_int = col("pos_orig_y").cast(IntegerType())

combine_df = (
    combine_df
    .withColumn("pos_orig_x", origin_x_int)
    .withColumn("pos_orig_y", origin_y_int)
)
combine_df.show()

+-------+------------+--------------------+--------+--------------------+-------+---------+------+-----------+-----------------+----------+---------+--------------------+----------+----------+----------+----------+------+-------+--------+--------------------+--------+-------------------+------+----------------+-------+--------------------+--------------------+--------------------+--------+-------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+------+
|eventId|subEventName|                tags|playerId|           positions|matchId|eventName|teamId|matchPeriod|         eventSec|subEventId|       id|            tagsList|pos_

In [7]:
# new_df = combine_df.withColumn("shotDistance", GetShotDistanceToGoal(col("pos_orig_x"), col("pos_orig_y"))).withColumn("shotAngle", GetShotAngleToGoal(col("pos_orig_x"), col("pos_orig_y")))
from pyspark.sql.functions import sqrt, pow
# Offsets from the shot origin to the point (100, 50), rescaled per axis
# (x by 105/100, y by 68/100) before taking the Euclidean norm.
# NOTE(review): the factors look like a 105 x 68 pitch in metres -- confirm.
x_offset = (100 - col("pos_orig_x")) * (105/100)
y_offset = (50 - col("pos_orig_y")) * 68/100

new_df = combine_df.withColumn(
    "shotDistance",
    sqrt(pow(x_offset, 2) + pow(y_offset, 2)),
)

In [8]:
from pyspark.sql.functions import try_divide, acos, pi

from pyspark.sql.functions import when

# Shot angle = angle at the shot location between the lines to the two goal posts,
# obtained from the law of cosines: cos(angle) = (a^2 + b^2 - c^2) / (2ab),
# where a and b are the distances to the posts and c is the distance between them.
#
# NOTE(review): the constants are taken verbatim from the original expression and
# look like a 105 x 68 pitch with a 7.32-wide goal centred at y = 34 -- confirm.
GOAL_WIDTH = 7.32

# Shot origin rescaled from the raw coordinates (x by 105/100, y by 68/100).
shot_x = col("pos_orig_x") * (105/100)
shot_y = col("pos_orig_y") * (68/100)

# Squared distance from the shot origin to each post.
dist_sq_upper_post = pow(105 - shot_x, 2) + pow(34 + (GOAL_WIDTH/2) - shot_y, 2)
dist_sq_lower_post = pow(105 - shot_x, 2) + pow(34 - (GOAL_WIDTH/2) - shot_y, 2)

divident = dist_sq_upper_post + dist_sq_lower_post - pow(GOAL_WIDTH, 2)
divisor = 2 * (sqrt(dist_sq_upper_post) * sqrt(dist_sq_lower_post))

# try_divide yields null (not an error) when the divisor is zero, i.e. when the
# shot origin coincides with a post. withColumn is used so that re-running this
# cell overwrites "_divison" instead of appending a second column with that name.
new_df = new_df.withColumn("_divison", try_divide(divident, divisor))

# Floating-point rounding can leave the cosine a hair outside [-1, 1] when the
# shot origin is collinear with the posts; acos would then return NaN. Clamp it
# back into range. Null cosines fall through to otherwise() and stay null.
cos_angle = col("_divison")
cos_angle_clamped = (
    when(cos_angle > 1.0, 1.0)
    .when(cos_angle < -1.0, -1.0)
    .otherwise(cos_angle)
)

# Convert from radians to degrees.
new_df = new_df.withColumn("shotAngle", acos(cos_angle_clamped) * (180 / pi()))

In [9]:
# Preview the first 20 rows with long cell values truncated (the show() defaults, spelled out).
new_df.show(n=20, truncate=True)

+-------+------------+--------------------+--------+--------------------+-------+---------+------+-----------+-----------------+----------+---------+--------------------+----------+----------+----------+----------+------+-------+--------+--------------------+--------+-------------------+------+----------------+-------+--------------------+--------------------+--------------------+--------+-------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+-------------+-------------+----------+------------+-----------+------------+------------------+--------------------+-------------+---------------------+----------------------+-----------------------------+------+------------------+-------------------+------------------+
|eventId|subEventName|                tags|playerId|           positions|matchId|eventName|teamId|matchPeriod|      

In [10]:
# Columns kept for the final table; subEventName is exposed under the name shotType.
feature_columns = [
    col("subEventName").alias("shotType"),
    col("shotDistance"),
    col("shotAngle"),
    col("isGoal"),
]

selected_df = new_df.select(*feature_columns)
selected_df.show()

+--------+------------------+------------------+------+
|shotType|      shotDistance|         shotAngle|isGoal|
+--------+------------------+------------------+------+
|    Shot|14.007655050007479|26.770923352883432|  true|
|    Shot|15.808608414405109|25.986925127855983| false|
|    Shot|4.4147027079974475| 78.78830064826107|  true|
|    Shot|23.057235306948662|15.737204081613136| false|
|    Shot|29.563871532666354|12.585865672300931| false|
|    Shot|12.891873409245067| 26.70215226257294|  true|
|    Shot|  33.9136093626143|11.838110080690875|  true|
|    Shot|15.674297432421016| 24.85523713293647|  true|
|    Shot| 16.12661154737721| 17.30640000401402| false|
|    Shot|29.905733563984015|13.255181730629005| false|
|    Shot|6.8620988043017865| 53.84777099608625|  true|
|    Shot|10.007677053142753| 35.32634581193244| false|
|    Shot|30.176149522429135| 13.48842341488603|  true|
|    Shot|13.717583606451978|29.756177313507177|  true|
|    Shot|               8.4|47.086889734873346|

In [11]:
# Print the column names, data types and nullability of the selected columns.
selected_df.printSchema()

root
 |-- shotType: string (nullable = true)
 |-- shotDistance: double (nullable = true)
 |-- shotAngle: double (nullable = true)
 |-- isGoal: boolean (nullable = true)



In [12]:
# Persist the selected columns as CSV with a header row. Spark treats the target
# path as a directory; repartition(1) makes it contain a single part file.
# The default save mode raises if the path already exists, which makes a second
# "Restart & Run All" fail on this cell. Overwrite mode replaces the previous
# run's output so the notebook stays re-runnable.
(
    selected_df
    .repartition(1)
    .write
    .mode("overwrite")
    .csv("../data/processed/collection/dataCollect.csv", header=True)
)