# Datapreprocessing

#### We start by loading in the Data Dictionary. All data fields are explained in the dictionary as well as a variety of extra information that may or may not be useful depedending on what one may want to do.

#### We will now load in the dataset of all the shots from the 2024-2025 season so far (the season and its playoffs are still currently ongoing). 

### We will also be removing certain columns of data that will not be needed in later cells.

In [63]:
#Imports
import pandas as pd
from pgmpy.estimators import BayesianEstimator
# from pgmpy import BayesianModel
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.inference import VariableElimination


In [64]:
#Loading in the data
df_Dictionary = pd.read_csv('Data/MoneyPuck_Shot_Data_Dictionary.csv')
# print(df_Dictionary.to_string()) 

In [65]:
#Loading in the complete shot dataset of the 2024-2025 season including shots from the playoffs so far.
df_Shots_Dataset = pd.read_csv('Data/shots_2024.csv')
# print(df_Shots_Dataset.to_string()) 

In [66]:
#Basic data manipulation
rows, columns = df_Shots_Dataset.shape
print(f"Rows: {rows}, Columns: {columns}")

Rows: 112297, Columns: 137


In [67]:
column_names = df_Shots_Dataset.columns.tolist()
print(column_names)

['shotID', 'arenaAdjustedShotDistance', 'arenaAdjustedXCord', 'arenaAdjustedXCordABS', 'arenaAdjustedYCord', 'arenaAdjustedYCordAbs', 'averageRestDifference', 'awayEmptyNet', 'awayPenalty1Length', 'awayPenalty1TimeLeft', 'awaySkatersOnIce', 'awayTeamCode', 'awayTeamGoals', 'defendingTeamAverageTimeOnIce', 'defendingTeamAverageTimeOnIceOfDefencemen', 'defendingTeamAverageTimeOnIceOfDefencemenSinceFaceoff', 'defendingTeamAverageTimeOnIceOfForwards', 'defendingTeamAverageTimeOnIceOfForwardsSinceFaceoff', 'defendingTeamAverageTimeOnIceSinceFaceoff', 'defendingTeamDefencemenOnIce', 'defendingTeamForwardsOnIce', 'defendingTeamMaxTimeOnIce', 'defendingTeamMaxTimeOnIceOfDefencemen', 'defendingTeamMaxTimeOnIceOfDefencemenSinceFaceoff', 'defendingTeamMaxTimeOnIceOfForwards', 'defendingTeamMaxTimeOnIceOfForwardsSinceFaceoff', 'defendingTeamMaxTimeOnIceSinceFaceoff', 'defendingTeamMinTimeOnIce', 'defendingTeamMinTimeOnIceOfDefencemen', 'defendingTeamMinTimeOnIceOfDefencemenSinceFaceoff', 'defendin

In [68]:
#Column names to delete
columns_to_drop = ["arenaAdjustedXCord", "arenaAdjustedXCordABS", "arenaAdjustedYCord", "arenaAdjustedYCordAbs", "awayEmptyNet", 
                   "awayPenalty1Length", "awayPenalty1TimeLeft", "awayTeamCode", "awayTeamGoals", "defendingTeamAverageTimeOnIce", 
                   "defendingTeamAverageTimeOnIceOfDefencemen", "defendingTeamAverageTimeOnIceOfDefencemenSinceFaceoff", 
                   "defendingTeamAverageTimeOnIceOfForwards", "defendingTeamAverageTimeOnIceOfForwardsSinceFaceoff", 
                   "defendingTeamAverageTimeOnIceSinceFaceoff","defendingTeamMaxTimeOnIce", 
                   "defendingTeamMaxTimeOnIceOfDefencemen", "defendingTeamMaxTimeOnIceOfDefencemenSinceFaceoff", 
                   "defendingTeamMaxTimeOnIceOfForwards", "defendingTeamMaxTimeOnIceOfForwardsSinceFaceoff", 
                   "defendingTeamMaxTimeOnIceSinceFaceoff", "defendingTeamMinTimeOnIce", "defendingTeamMinTimeOnIceOfDefencemen", 
                   "defendingTeamMinTimeOnIceOfDefencemenSinceFaceoff", "defendingTeamMinTimeOnIceOfForwards",
                   "defendingTeamMinTimeOnIceOfForwardsSinceFaceoff", "defendingTeamMinTimeOnIceSinceFaceoff", 
                   "distanceFromLastEvent", "event", "gameOver", "game_id", "goalieIdForShot", "homeEmptyNet", "homePenalty1Length", 
                   "homePenalty1TimeLeft", "homeTeamCode", "homeTeamGoals", "homeTeamScore", "homeTeamWon", "homeWinProbability",
                   "id", "isHomeTeam", "lastEventCategory", "lastEventShotAngle", "lastEventShotDistance", "lastEventTeam", "lastEventxCord", 
                   "lastEventxCord_adjusted", "lastEventyCord", "lastEventyCord_adjusted", "location", "offWing", "penaltyLength", "period", 
                   "playerNumThatDidEvent", "playerNumThatDidLastEvent", "roadTeamCode", "roadTeamScore", "season", "shooterPlayerId", 
                   "shooterTimeOnIce", "shooterTimeOnIceSinceFaceoff", "shootingTeamAverageTimeOnIce", "shootingTeamAverageTimeOnIceOfDefencemen", 
                   "shootingTeamAverageTimeOnIceOfDefencemenSinceFaceoff", "shootingTeamAverageTimeOnIceOfForwards", 
                   "shootingTeamAverageTimeOnIceOfForwardsSinceFaceoff", "shootingTeamAverageTimeOnIceSinceFaceoff", "shootingTeamMaxTimeOnIce", "shootingTeamMaxTimeOnIceOfDefencemen", 
                   "shootingTeamMaxTimeOnIceOfDefencemenSinceFaceoff", "shootingTeamMaxTimeOnIceOfForwards", 
                   "shootingTeamMaxTimeOnIceOfForwardsSinceFaceoff", "shootingTeamMaxTimeOnIceSinceFaceoff", "shootingTeamMinTimeOnIce", 
                   "shootingTeamMinTimeOnIceOfDefencemen", "shootingTeamMinTimeOnIceOfDefencemenSinceFaceoff", 
                   "shootingTeamMinTimeOnIceOfForwards", "shootingTeamMinTimeOnIceOfForwardsSinceFaceoff", "shootingTeamMinTimeOnIceSinceFaceoff", 
                   "shotAngle", "shotAnglePlusRebound", "shotAnglePlusReboundSpeed", "shotAngleReboundRoyalRoad", "shotGeneratedRebound", 
                   "shotGoalProbability", "shotGoalieFroze", "shotPlayContinued", "shotPlayContinuedInZone", "shotPlayContinuedOutsideZone", 
                   "shotPlayStopped", "shotRebound", "shotRush", "shotWasOnGoal", "speedFromLastEvent", "team", "teamCode", "time", 
                   "timeBetweenEvents", "timeDifferenceSinceChange", "timeLeft", "timeSinceFaceoff", "timeSinceLastEvent", "timeUntilNextEvent", "wentToOT",
                   "wentToShootout", "xCord", "xFroze", "xPlayContinuedInZone", "xPlayContinuedOutsideZone", "xPlayStopped", "xRebound", 
                   "xShotWasOnGoal", "yCord", "playoffGame", "xCordAdjusted", "yCordAdjusted", "shotDistance", "awaySkatersOnIce", "homeSkatersOnIce"]

#Drop one or more columns using df.drop(), df_Final_Shots is our new dataset that we will be using.
df_Final_Shots = df_Shots_Dataset.drop(columns=columns_to_drop, axis=1)
df_Final_Shots

Unnamed: 0,shotID,arenaAdjustedShotDistance,averageRestDifference,defendingTeamDefencemenOnIce,defendingTeamForwardsOnIce,goal,goalieNameForShot,isPlayoffGame,playerPositionThatDidEvent,shooterLeftRight,shooterName,shootingTeamDefencemenOnIce,shootingTeamForwardsOnIce,shotAngleAdjusted,shotOnEmptyNet,shotType,xGoal
0,0,52.000000,0.000000,2,3,0,Ukko-Pekka Luukkonen,0,D,,Simon Nemec,2,3,51.340192,0,WRIST,0.012537
1,1,33.000000,-6.000000,2,3,0,Ukko-Pekka Luukkonen,0,L,L,Jesper Bratt,2,3,57.264774,0,WRIST,0.021962
2,2,48.000000,-12.600000,2,3,0,Ukko-Pekka Luukkonen,0,D,R,Dougie Hamilton,2,3,30.343249,0,SLAP,0.028057
3,3,58.000000,0.000000,2,3,0,Jacob Markstrom,0,R,L,JJ Peterka,2,3,32.855722,0,WRIST,0.009832
4,4,56.000000,0.000000,2,3,0,Jacob Markstrom,0,D,L,Owen Power,2,3,15.802514,0,SLAP,0.028884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112292,75,69.000000,-2.400000,2,3,0,Mackenzie Blackwood,1,L,,Mason Marchment,2,3,29.638384,0,WRIST,0.009716
112293,76,15.000000,0.000000,2,3,0,Mackenzie Blackwood,1,L,,Mason Marchment,2,3,3.814075,0,WRIST,0.238243
112294,77,7.071068,-1.200000,2,3,1,Jake Oettinger,1,D,L,Devon Toews,2,3,8.130102,0,TIP,0.185902
112295,78,56.000000,19.166667,1,5,1,,1,C,R,Nathan MacKinnon,2,3,45.744059,1,WRIST,0.516078


In [69]:
#Deleting all shots that were taking on empty nets whether they were a goal or not

#Delete rows where the shotOnEmptyNet is 1 which means that the shot was made while the goalie was not present in his net.
df_Final_Shots = df_Final_Shots.drop(df_Final_Shots[df_Final_Shots['shotOnEmptyNet'] == 1].index)

#Remove shotOnEmptyNet column
df_Final_Shots = df_Final_Shots.drop('shotOnEmptyNet', axis=1)
# df_Final_Shots


In [70]:
#Deleting all shots where the shooters handiness is deemed NaN in the dataframe
df_Final_Shots.dropna(subset=['shooterLeftRight'], inplace=True)
# df_Final_Shots


In [71]:
#Deleting all other rows where at least one value of NaN appears. We have enough shots to be able to do this. (over 100k)
df_Final_Shots.dropna(inplace=True)
# df_Final_Shots

In [72]:
# Total NaN values in the DataFrame. Should return 0 now.
total_nan_count = df_Final_Shots.isna().sum().sum()
print(total_nan_count)

0


In [73]:
#Combining defendingTeamDefencemenOnIce with defendingTeamForwardsOnIce into a single column named "defendingTeamPlayersOnIce"
df_Final_Shots['DefendersOnIce'] = df_Final_Shots['defendingTeamDefencemenOnIce'] + df_Final_Shots['defendingTeamForwardsOnIce']
# df_Final_Shots

In [74]:
#Combining shootingTeamDefencemenOnIce with shootingTeamForwardsOnIce into a single column named "AttackersOnIce"
df_Final_Shots['AttackersOnIce'] = df_Final_Shots['shootingTeamDefencemenOnIce'] + df_Final_Shots['shootingTeamForwardsOnIce']
# df_Final_Shots

In [75]:
#Deleting all shots taken during the playoffs
df_Final_Shots = df_Final_Shots.drop(df_Final_Shots[df_Final_Shots['isPlayoffGame'] == 1].index)
# df_Final_Shots 

In [76]:
#Removing shootingTeamForwardsOnIce, shootingTeamDefencemenOnIce, defendingTeamDefencemenOnIce and defendingTeamForwardsOnIce columns from the dataframe.

#Column names to delete
columns_to_drop = ["shootingTeamDefencemenOnIce", "shootingTeamForwardsOnIce", "defendingTeamDefencemenOnIce", "defendingTeamForwardsOnIce", "isPlayoffGame"]

#Drop one or more columns using df.drop(), df_Final_Shots is our new dataset that we will be using.
df_Final_Shots = df_Final_Shots.drop(columns=columns_to_drop, axis=1)
# df_Final_Shots

In [77]:
#Duplicate dataframe to have a two column dataframe with shotID and ExpectedGoals -> Useful for comparaison later in the results section
df_Expected_Goals = df_Final_Shots[["shotID", "xGoal"]]
# df_Expected_Goals

# Discretizing the Dataset

We need to discretize the dataset to be able to use it in a Bayesian Model.

In [78]:
#Find the max values in each column
max_values_column = df_Final_Shots.max()
print(max_values_column)

shotID                                   112148
arenaAdjustedShotDistance                  98.0
averageRestDifference                      99.0
goal                                          1
goalieNameForShot              Yaroslav Askarov
playerPositionThatDidEvent                    R
shooterLeftRight                              R
shooterName                   Zemgus Girgensons
shotAngleAdjusted                     88.492564
shotType                                  WRIST
xGoal                                  0.957015
DefendersOnIce                                6
AttackersOnIce                                7
dtype: object


In [79]:
#Find the min values in each column
min_values_column = df_Final_Shots.min()
print(min_values_column)

shotID                                 1
arenaAdjustedShotDistance            1.0
averageRestDifference             -116.8
goal                                   0
goalieNameForShot              Adin Hill
playerPositionThatDidEvent             C
shooterLeftRight                       L
shooterName                   A.J. Greer
shotAngleAdjusted                    0.0
shotType                            BACK
xGoal                           0.001442
DefendersOnIce                         3
AttackersOnIce                         3
dtype: object


In [80]:
#Let's discretize player names and goalie names first. We will divide the shooters and the goalies based on their in season rankings.


#Let's load in the top 150 goalscorers ordered by the number of goals scored.
df_Goals1To50 = pd.read_csv('Data/GoalsRank1To50.csv')
df_Goals1To50 = df_Goals1To50['Player']
# print(df_Goals1To50.to_string()) 

df_Goals51To100 = pd.read_csv('Data/GoalsRank51To100.csv')
df_Goals51To100 = df_Goals51To100['Player']
# print(df_Goals51To100.to_string()) 

df_Goals101To150 = pd.read_csv('Data/GoalsRank101To150.csv')
df_Goals101To150 = df_Goals101To150['Player']
# print(df_Goals101To150.to_string()) 

df_Goals1To150 = pd.concat([df_Goals1To50, df_Goals51To100], ignore_index=True)
df_Goals1To150 = pd.concat([df_Goals1To150, df_Goals101To150], ignore_index=True)
# print(df_Goals1To150.to_string())

shooterNames = df_Goals1To150.to_list()

#Not a top 150 goalscorer
df_Final_Shots["shooterName"] = df_Final_Shots["shooterName"].where(df_Final_Shots["shooterName"].isin(shooterNames), other=15)

#0 is a top 10 scorer, 1 is a top 20 scorer, 2 is a top 30 scorer, etc
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[0:10], 0)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[10:20], 1)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[20:30], 2)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[30:40], 3)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[40:50], 4)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[50:60], 5)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[60:70], 6)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[70:80], 7)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[80:90], 8)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[90:100], 9)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[100:110], 10)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[110:120], 11)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[120:130], 12)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[130:140], 13)
df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[140:150], 14)

# df_Final_Shots

  df_Final_Shots['shooterName'] = df_Final_Shots['shooterName'].replace(shooterNames[140:150], 14)


In [81]:
#Let's load in the top 150 goalscorers ordered by the number of goals scored.
df_Goalies1To50 = pd.read_csv('Data/Goalies1To45.csv')
df_Goalies1To50 = df_Goalies1To50['Player']
# print(df_Goalies1To50.to_string()) 

goalieNames = df_Goalies1To50.to_list()

# print(goalieNames)

#Not a top 45 goalie
df_Final_Shots["goalieNameForShot"] = df_Final_Shots["goalieNameForShot"].where(df_Final_Shots["goalieNameForShot"].isin(goalieNames), other=9)

#0 is a top 5 goalie, 1 is a top 10 goalie, 2 is a top 15 goalie, etc
df_Final_Shots['goalieNameForShot'] = df_Final_Shots['goalieNameForShot'].replace(goalieNames[0:5], 0)
df_Final_Shots['goalieNameForShot'] = df_Final_Shots['goalieNameForShot'].replace(goalieNames[5:10], 1)
df_Final_Shots['goalieNameForShot'] = df_Final_Shots['goalieNameForShot'].replace(goalieNames[10:15], 2)
df_Final_Shots['goalieNameForShot'] = df_Final_Shots['goalieNameForShot'].replace(goalieNames[15:20], 3)
df_Final_Shots['goalieNameForShot'] = df_Final_Shots['goalieNameForShot'].replace(goalieNames[20:25], 4)
df_Final_Shots['goalieNameForShot'] = df_Final_Shots['goalieNameForShot'].replace(goalieNames[25:30], 5)
df_Final_Shots['goalieNameForShot'] = df_Final_Shots['goalieNameForShot'].replace(goalieNames[30:35], 6)
df_Final_Shots['goalieNameForShot'] = df_Final_Shots['goalieNameForShot'].replace(goalieNames[35:40], 7)
df_Final_Shots['goalieNameForShot'] = df_Final_Shots['goalieNameForShot'].replace(goalieNames[40:45], 8)

# df_Final_Shots

  df_Final_Shots['goalieNameForShot'] = df_Final_Shots['goalieNameForShot'].replace(goalieNames[40:45], 8)


In [82]:
#Discretizing arenaAdjustedShotDistance
df_Final_Shots['DiscreteShotDistance'] = pd.cut(df_Final_Shots['arenaAdjustedShotDistance'], bins=10, labels=[0, 1, 2,
                                                                                                              3, 4, 5,
                                                                                                              6, 7, 8, 9])
# df_Final_Shots


In [83]:
#Discretizing shotAngleAdjusted
df_Final_Shots['DiscreteShotAngle'] = pd.cut(df_Final_Shots['shotAngleAdjusted'], bins=10, labels=[0, 1, 2,
                                                                                                   3, 4, 5,
                                                                                                   6, 7, 8, 9])
# df_Final_Shots

In [84]:
#Discretizing averageRestDifference
df_Final_Shots['DiscreteAverageRestDifference'] = pd.cut(df_Final_Shots['averageRestDifference'], bins=11, labels=[0, 1, 2, 
                                                                                                              3, 4, 5,
                                                                                                              6, 7, 8,
                                                                                                              9, 10])
# df_Final_Shots

In [85]:
#Discretizing shooterLeftRight
df_Final_Shots["DiscreteShooterLeftOrRight"], uniques = pd.factorize(df_Final_Shots["shooterLeftRight"])
# df_Final_Shots

In [86]:
#Discretizing shooterLeftRight
df_Final_Shots["DiscretePlayerPosition"], uniques = pd.factorize(df_Final_Shots["playerPositionThatDidEvent"])
# df_Final_Shots

In [None]:
#Discretizing shotType 
df_Final_Shots["DiscreteShotType"], uniques = pd.factorize(df_Final_Shots["shotType"])
df_IDs = df_Final_Shots
# df_Final_Shots

Unnamed: 0,shotID,arenaAdjustedShotDistance,averageRestDifference,goal,goalieNameForShot,playerPositionThatDidEvent,shooterLeftRight,shooterName,shotAngleAdjusted,shotType,xGoal,DefendersOnIce,AttackersOnIce,DiscreteShotDistance,DiscreteShotAngle,DiscreteAverageRestDifference,DiscreteShooterLeftOrRight,DiscretePlayerPosition,DiscreteShotType
1,1,33.000000,-6.00,0,8,L,L,12,57.264774,WRIST,0.021962,5,5,3,6,5,0,0,0
2,2,48.000000,-12.60,0,8,D,R,15,30.343249,SLAP,0.028057,5,5,4,3,5,1,1,1
3,3,58.000000,0.00,0,5,R,L,6,32.855722,WRIST,0.009832,5,5,5,3,5,0,2,0
4,4,56.000000,0.00,0,5,D,L,15,15.802514,SLAP,0.028884,5,5,5,1,5,0,1,1
5,5,11.000000,-29.75,0,8,C,L,2,11.309932,WRIST,0.182827,5,4,1,1,4,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112144,112144,37.802116,-8.20,0,6,L,L,8,37.476180,WRIST,0.059082,5,5,3,4,5,0,0,0
112145,112145,83.815273,20.80,0,8,C,L,15,19.515873,TIP,0.009116,5,5,8,2,7,0,3,2
112146,112146,66.400301,1.40,0,6,D,R,15,37.042475,WRIST,0.007529,5,5,6,4,6,1,1,0
112147,112147,68.505474,8.20,0,6,D,L,15,33.690068,WRIST,0.007924,5,5,6,3,6,0,1,0


In [26]:
#Final Column Names
column_names = df_Final_Shots.columns.tolist()
print(column_names)

['shotID', 'arenaAdjustedShotDistance', 'averageRestDifference', 'goal', 'goalieNameForShot', 'playerPositionThatDidEvent', 'shooterLeftRight', 'shooterName', 'shotAngleAdjusted', 'shotType', 'xGoal', 'DefendersOnIce', 'AttackersOnIce', 'DiscreteShotDistance', 'DiscreteShotAngle', 'DiscreteAverageRestDifference', 'DiscreteShooterLeftOrRight', 'DiscretePlayerPosition', 'DiscreteShotType']


# Creating the Bayesian Network

In [27]:
#Creating the dataframe that will be used for the Bayesian Model
df_Bayesian_Network = df_Final_Shots[['goal', 'DefendersOnIce', 'AttackersOnIce', 'shooterName', 'goalieNameForShot', 
                                      'DiscreteShotDistance', 'DiscreteShotAngle', 'DiscreteAverageRestDifference', 
                                      'DiscreteShooterLeftOrRight', 'DiscretePlayerPosition', 'DiscreteShotType']]
df_Bayesian_Network

Unnamed: 0,goal,DefendersOnIce,AttackersOnIce,shooterName,goalieNameForShot,DiscreteShotDistance,DiscreteShotAngle,DiscreteAverageRestDifference,DiscreteShooterLeftOrRight,DiscretePlayerPosition,DiscreteShotType
1,0,5,5,12,8,3,6,5,0,0,0
2,0,5,5,15,8,4,3,5,1,1,1
3,0,5,5,6,5,5,3,5,0,2,0
4,0,5,5,15,5,5,1,5,0,1,1
5,0,5,4,2,8,1,1,4,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...
112144,0,5,5,8,6,3,4,5,0,0,0
112145,0,5,5,15,8,8,2,7,0,3,2
112146,0,5,5,15,6,6,4,6,1,1,0
112147,0,5,5,15,6,6,3,6,0,1,0


In [28]:
#Creating the above Bayesian Network using 
model = DiscreteBayesianNetwork([('DefendersOnIce', 'goal'), ('AttackersOnIce', 'goal'), ('shooterName', 'goal'), 
                                 ('goalieNameForShot', 'goal'), ('DiscreteShotDistance', 'goal'), ('DiscreteShotAngle', 'goal'), 
                                 ('DiscreteAverageRestDifference', 'goal'), ('DiscreteShooterLeftOrRight','goal'), ('DiscretePlayerPosition', 'goal'), 
                                 ('DiscreteShotType', 'goal'), ('DiscreteShotDistance', 'DiscreteShotType'), 
                                 ('DiscretePlayerPosition', 'DiscreteShotType'), ('shooterName', 'DiscreteShotType'), 
                                 ('DiscreteShotType', 'DiscreteShotAngle'), ('DiscreteShooterLeftOrRight', 'DiscreteShotAngle'), 
                                 ('shooterName', 'DiscreteShooterLeftOrRight'), ('shooterName', 'DiscretePlayerPosition'), 
                                 ('DiscretePlayerPosition', 'DiscreteShotDistance')])

## The Original Bayesian Network:

![BayesianNetwork](BayesianNetworkWithBackground.png)

In [29]:
# Fitting the model to the data -> Training this kills the kernel on my current setup, we need to make the model less computationally expensive
# model.fit(df_Bayesian_Network, estimator=BayesianEstimator, prior_type="BDeu")
# for cpd in model.get_cpds():
#     print(cpd)


## The Modified Bayesian Network:

![BayesianNetwork](ModifiedBayesianNetwork.png)

In [30]:
df_Bayesian_Network_Modified = df_Final_Shots[['goal', 'DefendersOnIce', 'AttackersOnIce', 
                                      'DiscreteShotDistance', 'DiscreteShotAngle', 'DiscreteAverageRestDifference', 
                                      'DiscreteShooterLeftOrRight', 'DiscretePlayerPosition', 'DiscreteShotType']]
df_Bayesian_Network_Modified

Unnamed: 0,goal,DefendersOnIce,AttackersOnIce,DiscreteShotDistance,DiscreteShotAngle,DiscreteAverageRestDifference,DiscreteShooterLeftOrRight,DiscretePlayerPosition,DiscreteShotType
1,0,5,5,3,6,5,0,0,0
2,0,5,5,4,3,5,1,1,1
3,0,5,5,5,3,5,0,2,0
4,0,5,5,5,1,5,0,1,1
5,0,5,4,1,1,4,0,3,0
...,...,...,...,...,...,...,...,...,...
112144,0,5,5,3,4,5,0,0,0
112145,0,5,5,8,2,7,0,3,2
112146,0,5,5,6,4,6,1,1,0
112147,0,5,5,6,3,6,0,1,0


In [31]:
# Fitting the model to the data
model_Modified = DiscreteBayesianNetwork([('DefendersOnIce', 'goal'), ('AttackersOnIce', 'goal'), 
                                 ('DiscreteShotDistance', 'goal'), ('DiscreteShotAngle', 'goal'),
                                 ('DiscreteAverageRestDifference', 'goal'), ('DiscreteShooterLeftOrRight','goal'), ('DiscretePlayerPosition', 'goal'), 
                                 ('DiscreteShotType', 'goal'), ('DiscreteShotDistance', 'DiscreteShotType'), 
                                 ('DiscretePlayerPosition', 'DiscreteShotType'),
                                 ('DiscreteShotType', 'DiscreteShotAngle'), ('DiscreteShooterLeftOrRight', 'DiscreteShotAngle'), 
                                 ('DiscretePlayerPosition', 'DiscreteShotDistance')])


model_Modified.fit(df_Bayesian_Network_Modified, estimator=BayesianEstimator, prior_type="BDeu")
for cpd in model_Modified.get_cpds():
    print(cpd)

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'goal': 'N', 'DefendersOnIce': 'N', 'AttackersOnIce': 'N', 'DiscreteShotDistance': 'O', 'DiscreteShotAngle': 'O', 'DiscreteAverageRestDifference': 'O', 'DiscreteShooterLeftOrRight': 'N', 'DiscretePlayerPosition': 'N', 'DiscreteShotType': 'N'}


+-------------------+-------------+
| DefendersOnIce(3) | 0.0164103   |
+-------------------+-------------+
| DefendersOnIce(4) | 0.139183    |
+-------------------+-------------+
| DefendersOnIce(5) | 0.84431     |
+-------------------+-------------+
| DefendersOnIce(6) | 9.64897e-05 |
+-------------------+-------------+
+-------------------------------+-----+
| AttackersOnIce                | ... |
+-------------------------------+-----+
| DefendersOnIce                | ... |
+-------------------------------+-----+
| DiscreteAverageRestDifference | ... |
+-------------------------------+-----+
| DiscretePlayerPosition        | ... |
+-------------------------------+-----+
| DiscreteShooterLeftOrRight    | ... |
+-------------------------------+-----+
| DiscreteShotAngle             | ... |
+-------------------------------+-----+
| DiscreteShotDistance          | ... |
+-------------------------------+-----+
| DiscreteShotType              | ... |
+-------------------------------+---

# Testing the Model on Real Situations

In [59]:
#Get 5 random shots
random_sample = df_IDs.sample(n=5)
random_sample


Unnamed: 0,shotID,arenaAdjustedShotDistance,averageRestDifference,goal,goalieNameForShot,playerPositionThatDidEvent,shooterLeftRight,shooterName,shotAngleAdjusted,shotType,xGoal,DefendersOnIce,AttackersOnIce,DiscreteShotDistance,DiscreteShotAngle,DiscreteAverageRestDifference,DiscreteShooterLeftOrRight,DiscretePlayerPosition,DiscreteShotType
1728,1728,54.0,0.0,0,5,D,L,15,11.944177,SLAP,0.02078,5,5,5,1,5,0,1,1
18636,18636,58.034473,-1.833333,0,4,D,R,15,1.974934,WRIST,0.022352,5,6,5,0,5,1,1,0
3725,3725,8.246211,0.0,1,6,L,L,10,14.036243,TIP,0.178994,4,5,0,1,5,0,0,2
33161,33161,40.0,-6.0,0,7,D,L,15,66.614779,WRIST,0.009508,5,5,4,7,5,0,1,0
15633,15633,23.0,0.0,1,9,C,L,15,39.805571,BACK,0.022074,5,5,2,4,5,0,3,5


In [None]:
#Inference using variable elimination. goal(1) is the expected goal value
ExpectedGoalModel = VariableElimination(model_Modified)

shot1 = ExpectedGoalModel.query(variables=['goal'], evidence={'DefendersOnIce': 5, 'AttackersOnIce': 5, 
                                                              'DiscreteShotDistance': 5, 'DiscreteShotAngle': 1, 
                                                              'DiscreteAverageRestDifference':5, 'DiscreteShooterLeftOrRight': 0, 
                                                              'DiscretePlayerPosition':1, 'DiscreteShotType':1})

shot2 = ExpectedGoalModel.query(variables=['goal'], evidence={'DefendersOnIce': 5, 'AttackersOnIce': 6, 
                                                              'DiscreteShotDistance': 5, 'DiscreteShotAngle': 0, 
                                                              'DiscreteAverageRestDifference':5, 'DiscreteShooterLeftOrRight': 1, 
                                                              'DiscretePlayerPosition':1, 'DiscreteShotType':0})

shot3 = ExpectedGoalModel.query(variables=['goal'], evidence={'DefendersOnIce': 4, 'AttackersOnIce': 5, 
                                                              'DiscreteShotDistance': 0, 'DiscreteShotAngle': 1, 
                                                              'DiscreteAverageRestDifference':5, 'DiscreteShooterLeftOrRight': 0, 
                                                              'DiscretePlayerPosition':0, 'DiscreteShotType':2})

shot4 = ExpectedGoalModel.query(variables=['goal'], evidence={'DefendersOnIce': 5, 'AttackersOnIce': 5, 
                                                              'DiscreteShotDistance': 4, 'DiscreteShotAngle': 7, 
                                                              'DiscreteAverageRestDifference':5, 'DiscreteShooterLeftOrRight': 0, 
                                                              'DiscretePlayerPosition':1, 'DiscreteShotType':0})

shot5 = ExpectedGoalModel.query(variables=['goal'], evidence={'DefendersOnIce': 5, 'AttackersOnIce': 5, 
                                                              'DiscreteShotDistance': 2, 'DiscreteShotAngle': 4, 
                                                              'DiscreteAverageRestDifference':5, 'DiscreteShooterLeftOrRight': 0, 
                                                              'DiscretePlayerPosition':3, 'DiscreteShotType':5})

print(shot1) #My value = 0.0244; Moneypuck xGoal value = 0.020780; Was it actually a goal -> No
print(shot2) #My value = 0.0000; Moneypuck xGoal value = 0.022352; Was it actually a goal -> No 
print(shot3) #My value = 0.2903; Moneypuck xGoal value = 0.178994	; Was it actually a goal -> Yes
print(shot4) #My value = 0.0103; Moneypuck xGoal value = 0.009508; Was it actually a goal -> No 
print(shot5) #My value =  0.0476; Moneypuck xGoal value = 0.022074	; Was it actually a goal -> Yes

+---------+-------------+
| goal    |   phi(goal) |
| goal(0) |      0.9756 |
+---------+-------------+
| goal(1) |      0.0244 |
+---------+-------------+
+---------+-------------+
| goal    |   phi(goal) |
| goal(0) |      1.0000 |
+---------+-------------+
| goal(1) |      0.0000 |
+---------+-------------+
+---------+-------------+
| goal    |   phi(goal) |
| goal(0) |      0.7097 |
+---------+-------------+
| goal(1) |      0.2903 |
+---------+-------------+
+---------+-------------+
| goal    |   phi(goal) |
| goal(0) |      0.9897 |
+---------+-------------+
| goal(1) |      0.0103 |
+---------+-------------+
+---------+-------------+
| goal    |   phi(goal) |
| goal(0) |      0.9524 |
+---------+-------------+
| goal(1) |      0.0476 |
+---------+-------------+
