In [0]:
# To transpose all statistics to the right side of the rink for easier computation
#Columns needed: play_id (text) (PK), event (text), datetime (date and time of the play), secondarytype (text)
#Transposed x (x1): if x >= 0, x1 =  x; if x < 0, x1 = x * -1
#Transposed y (y1): if x >= 0, y1 = y; if x < 0, y1 = y * -1
#Goal post position (gp_x) = 89 (fixed value)
#x is not NULL (x is not null from 2011)
#-89 <= x <= 89 (to remove shots at goal that originate from behind the goal post)


### Step 1: Load data from silver layer

In [0]:
# Location of the xy_game_plays table in the silver layer.
xy_transform_path = "/mnt/nhl-finalproject/silver/xy_game_plays"

# Read the table into a Spark DataFrame. The source is read as Delta
# (not plain parquet), so the format is passed explicitly.
xy_silver = spark.read.load(xy_transform_path, format="delta")

### Step 2: Transformation
- add new columns x1 and y1 (numeric, derived from x and y) and gp_x (int, constant 89)
- create the new table required for analysis 

In [0]:
from pyspark.sql import functions as F

# Build df_xy: the requested fields plus coordinates transposed onto the
# x >= 0 side of the rink.
#
# Row filter (both rules are listed as requirements in the notebook header but
# were not applied here before):
#   * x must not be NULL. With a NULL x the test `x >= 0` evaluates to NULL, so
#     the row fell into otherwise() and had its y negated while x1 stayed NULL.
#   * -89 <= x <= 89 (inclusive), which drops plays recorded behind the goal
#     post position.
# If the silver table is already restricted this way, the filter is a no-op.
#
# Output columns:
#   play_id, event, datetime, secondarytype - passed through unchanged
#   x1   - absolute value of x
#   y1   - y when x >= 0, otherwise y * -1 (y is flipped whenever x is flipped)
#   gp_x - constant goal post x position
GOAL_POST_X = 89  # fixed x position of the goal post; also the bound for valid x

x_col = F.col('x')
y_col = F.col('y')

# Keep only rows whose x is known and lies between the two goal post positions.
xy_in_range = xy_silver.filter(
    x_col.isNotNull() & x_col.between(-GOAL_POST_X, GOAL_POST_X)
)

df_xy = (
    xy_in_range.select(
        'play_id',
        'event',
        'datetime',
        'secondarytype',
        # x1: F.abs() turns negative x values into positive ones.
        F.abs(x_col).alias('x1'),
        # y1: keep y when x >= 0; otherwise multiply y by -1 so it is flipped
        # together with x.
        F.when(x_col >= 0, y_col).otherwise(y_col * -1).alias('y1'),
        # gp_x: F.lit() creates a literal (constant) column.
        F.lit(GOAL_POST_X).alias('gp_x'),
    )
)
df_xy.display()


play_id,event,datetime,secondarytype,x1,y1,gp_x
2016020045_73,Hit,2016-10-19T02:04:12Z,,73.0,40.0,89
2017020812_208,Blocked Shot,2018-02-07T02:05:58Z,,48.0,-17.0,89
2015020849_53,Faceoff,2016-02-17T00:22:22Z,,69.0,22.0,89
2015020849_127,Faceoff,2016-02-17T01:09:08Z,,69.0,-22.0,89
2015020849_253,Shot,2016-02-17T02:14:08Z,Snap Shot,39.0,21.0,89
2016020610_237,Faceoff,2017-01-10T02:30:56Z,,69.0,-22.0,89
2017020624_45,Shot,2018-01-05T02:22:50Z,Wrist Shot,44.0,-26.0,89
2017020624_67,Missed Shot,2018-01-05T02:30:37Z,,79.0,20.0,89
2017020221_77,Penalty,2017-11-07T02:04:05Z,Closing hand on puck,36.0,-3.0,89
2016020298_56,Penalty,2016-11-24T03:57:34Z,Tripping,56.0,35.0,89


In [0]:

# Persist the transformed DataFrame in Delta format under the gold layer path.
# mode='overwrite' replaces any data already stored at that location.
gold_xy_path = '/mnt/nhl-finalproject/gold/game_plays_xy'
df_xy.write.save(gold_xy_path, format='delta', mode='overwrite')