Importing all necessary libraries. 

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score, accuracy_score, roc_curve, auc, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier

Loading data from the CSV file. 

In [7]:
# Read the CSV into a DataFrame.
df = pd.read_csv('all_teams.csv')

# Print every column name, one per line, as a quick schema reference.
for column_name in df.columns:
    print(column_name)

team
season
name
gameId
playerTeam
opposingTeam
home_or_away
gameDate
position
situation
xGoalsPercentage
corsiPercentage
fenwickPercentage
iceTime
xOnGoalFor
xGoalsFor
xReboundsFor
xFreezeFor
xPlayStoppedFor
xPlayContinuedInZoneFor
xPlayContinuedOutsideZoneFor
flurryAdjustedxGoalsFor
scoreVenueAdjustedxGoalsFor
flurryScoreVenueAdjustedxGoalsFor
shotsOnGoalFor
missedShotsFor
blockedShotAttemptsFor
shotAttemptsFor
goalsFor
reboundsFor
reboundGoalsFor
freezeFor
playStoppedFor
playContinuedInZoneFor
playContinuedOutsideZoneFor
savedShotsOnGoalFor
savedUnblockedShotAttemptsFor
penaltiesFor
penalityMinutesFor
faceOffsWonFor
hitsFor
takeawaysFor
giveawaysFor
lowDangerShotsFor
mediumDangerShotsFor
highDangerShotsFor
lowDangerxGoalsFor
mediumDangerxGoalsFor
highDangerxGoalsFor
lowDangerGoalsFor
mediumDangerGoalsFor
highDangerGoalsFor
scoreAdjustedShotsAttemptsFor
unblockedShotAttemptsFor
scoreAdjustedUnblockedShotAttemptsFor
dZoneGiveawaysFor
xGoalsFromxReboundsOfShotsFor
xGoalsFromActualRebou

In [8]:
# Dataset overview and basic cleaning.
# Reports shape, a preview, summary statistics and dtypes, then removes rows
# containing missing values and exact duplicate rows. `df` is rebound to the
# cleaned frame, so later cells see the cleaned data.

n_rows, n_cols = df.shape
print("DATASET OVERVIEW")
print(f"\nDataset Shape: {df.shape}")
print(f"Number of observations: {n_rows}")
print(f"Number of features: {n_cols}")

print("FIRST FEW ROWS")
print(df.head())

print("BASIC STATISTICS")
print(df.describe())

print("DATA TYPES")
print(df.dtypes)

# checking for missing data
print("MISSING DATA")
missing_data = df.isnull().sum()      # per-column count of null cells
total_missing = missing_data.sum()    # computed once, reused below
print(missing_data)
print(f"\nTotal missing values: {total_missing}")

if total_missing > 0:
    print("\nDropping rows with missing values...")
    df = df.dropna()
    print(f"Dataset shape after removing missing values: {df.shape}")
    print(f"New number of observations: {df.shape[0]}")
    print(f"New number of features: {df.shape[1]}")

# checking for duplicate data
# duplicated() scans the whole frame, so count once (after any dropna above)
# and reuse the result for both the report and the decision to drop.
print("DUPLICATE DATA")
duplicate_count = df.duplicated().sum()
print("Number of Duplicate Rows: ", duplicate_count)

# drop any duplicate data
if duplicate_count > 0:
    print("Dropping duplicates...")
    df = df.drop_duplicates()
    print("Dataset shape after dropping duplicates: ", df.shape)
    print("New number of observations: ", df.shape[0])

print("Updated Data:")
print(df.head())

DATASET OVERVIEW

Dataset Shape: (221620, 111)
Number of observations: 221620
Number of features: 111
FIRST FEW ROWS
  team  season name      gameId playerTeam opposingTeam home_or_away  \
0  NYR    2008  NYR  2008020001        NYR          T.B         AWAY   
1  NYR    2008  NYR  2008020001        NYR          T.B         AWAY   
2  NYR    2008  NYR  2008020001        NYR          T.B         AWAY   
3  NYR    2008  NYR  2008020001        NYR          T.B         AWAY   
4  NYR    2008  NYR  2008020001        NYR          T.B         AWAY   

   gameDate    position situation  ...  unblockedShotAttemptsAgainst  \
0  20081004  Team Level     other  ...                           1.0   
1  20081004  Team Level       all  ...                          31.0   
2  20081004  Team Level      5on5  ...                          20.0   
3  20081004  Team Level      4on5  ...                           9.0   
4  20081004  Team Level      5on4  ...                           1.0   

   scoreAdjustedU

In [9]:
# Build the binary target: scored = 1 when goalsFor > 0, otherwise 0.
# The raw 'goalsFor' count is removed afterwards so the label is not also
# present as a feature.
#
# The guard makes the cell safe to re-run: once 'goalsFor' has been dropped,
# a second execution is a no-op instead of raising KeyError.
if 'goalsFor' in df.columns:
    df = (
        df.assign(scored=(df['goalsFor'] > 0).astype(int))
          .drop(columns=['goalsFor'])
    )
elif 'scored' not in df.columns:
    raise KeyError("Neither 'goalsFor' nor 'scored' is present; reload the data first.")

# NOTE(review): dropping 'goalsFor' alone may not remove the label from the
# features. The column listing for this file also shows reboundGoalsFor,
# lowDangerGoalsFor, mediumDangerGoalsFor, highDangerGoalsFor and the
# savedShotsOnGoalFor / savedUnblockedShotAttemptsFor counts next to
# shotsOnGoalFor; presumably these let a model reconstruct goalsFor > 0.
# TODO confirm and exclude them from the feature matrix before training.

# identify categorical data (need to know for one hot encoding)
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns:", categorical_cols)

Categorical Columns: ['team', 'name', 'playerTeam', 'opposingTeam', 'home_or_away', 'position', 'situation']


In [10]:
# One-hot encode the low-cardinality categorical columns, then remove the
# team/name string columns that should not be model features.
#
# Both steps only act on columns that are still present, so re-running the
# cell after it has already been applied is a no-op rather than a KeyError.
ohe_cols = ['home_or_away', 'position', 'situation']
pending_ohe = [col for col in ohe_cols if col in df.columns]
if pending_ohe:
    # drop_first=False keeps one indicator column per category level
    df = pd.get_dummies(df, columns=pending_ohe, drop_first=False)

# drop features we don't want included in the model
drop_cols = ['team', 'name', 'playerTeam', 'opposingTeam']
df = df.drop(columns=drop_cols, errors='ignore')

print("Updated Data Types: ", df.dtypes)
print("Updated Shape: ", df.shape)

Updated Data Types:  season                int64
gameId                int64
gameDate              int64
xGoalsPercentage    float64
corsiPercentage     float64
                     ...   
situation_4on5         bool
situation_5on4         bool
situation_5on5         bool
situation_all          bool
situation_other        bool
Length: 112, dtype: object
Updated Shape:  (221620, 112)


In [14]:
# drop identifying data we don't want
# errors='ignore' keeps the cell re-runnable: after the first execution these
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['gameId', 'gameDate'], errors='ignore')

# separate the target vector from the feature matrix
# (the actual train/validation/test split is performed in a later cell)
y = df['scored']
X = df.drop(columns=['scored'])

# print dataset, check for >1 target class (cannot have a 1-class classification problem)
print(f"Feature Matrix Shape: {X.shape}")
print(f"Target Vector: {y.shape}")
print(f"Target Classes: {y.unique()}")

Feature Matrix Shape: (221620, 109)
Target Vector: (221620,)
Target Classes: [0 1]


In [15]:
# Per-class row counts of the target; used to evaluate class imbalance.
class_counts = y.value_counts()
print("Target Distribution: ", class_counts)

Target Distribution:  scored
1    114453
0    107167
Name: count, dtype: int64


In [16]:
# Three-way stratified split: 70% training, 20% validation, 10% testing.

print('DATA SPLITTING')

# Stage 1: keep 70% for training and set aside the remaining 30%.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Stage 2: one third of the 30% hold-out becomes the test set (10% overall);
# the other two thirds become the validation set (20% overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=1/3, random_state=42, stratify=y_temp
)

# Report the shape of each partition.
for label, part in (
    ("Training Set Size: ", X_train),
    ("Validation Set Size: ", X_val),
    ("Testing Set Size: ", X_test),
):
    print(label, part.shape)

DATA SPLITTING
Training Set Size:  (155134, 109)
Validation Set Size:  (44324, 109)
Testing Set Size:  (22162, 109)


In [20]:
# Z-score standardization of the feature matrices.

print("DATA NORMALIZATION")

# Learn the scaling statistics from the training set only, so nothing from
# the validation or test sets leaks into the preprocessing step.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same fitted scaler to all three partitions.
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

# Sanity check over the whole scaled training matrix.
train_mean = X_train_scaled.mean()
train_std = X_train_scaled.std()
print(f"Training Set Mean: {train_mean:.6f}")
print(f"Training Set Standard Deviation: {train_std:.6f}")

DATA NORMALIZATION
Training Set Mean: -0.000000
Training Set Standard Deviation: 0.995402


In [None]:
# training the models 