In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import hockey_scraper
import pickle
import random



# Define Helper Functions

These helpers dynamically scrape Natural Stat Trick and the NHL API, create the features used in the model, and merge the resulting DataFrames.

In [2]:
def get_and_format_nst_team_stats(season, sit, rate):
    """Scrape one season of team game logs from Natural Stat Trick and format them.

    Parameters
    ----------
    season : str
        Both years of the season joined together, e.g. "20162017". The same
        value is used for the `fromseason` and `thruseason` query parameters.
    sit : str
        Game situation to request, e.g. "5v5".
    rate : str
        'n' for cumulative counts, 'y' for per-60-minute rates.

    Returns
    -------
    pandas.DataFrame
        The first HTML table found on the page, with three columns added:
        `Date` (parsed from the first 10 characters of `Game`),
        `Game_Number` (1-based running count of rows per team, in table order)
        and `Team_Key` ("<team abbreviation>_<date>"). The `Team` column holds
        abbreviations for every team name present in the lookup table below;
        names not in the lookup are left unchanged.
    """
    # Lookup from NST full team names to the team abbreviations from the NHL API
    team_abbreviations = {
        'Anaheim Ducks': 'ANA',
        'Arizona Coyotes': 'ARI',
        'Boston Bruins': 'BOS',
        'Buffalo Sabres': 'BUF',
        'Calgary Flames': 'CGY',
        'Carolina Hurricanes': 'CAR',
        'Chicago Blackhawks': 'CHI',
        'Colorado Avalanche': 'COL',
        'Columbus Blue Jackets': 'CBJ',
        'Dallas Stars': 'DAL',
        'Detroit Red Wings': 'DET',
        'Edmonton Oilers': 'EDM',
        'Florida Panthers': 'FLA',
        'Los Angeles Kings': 'L.A',
        'Minnesota Wild': 'MIN',
        'Montreal Canadiens': 'MTL',
        'Nashville Predators': 'NSH',
        'New Jersey Devils': 'N.J',
        'New York Islanders': 'NYI',
        'New York Rangers': 'NYR',
        'Ottawa Senators': 'OTT',
        'Philadelphia Flyers': 'PHI',
        'Pittsburgh Penguins': 'PIT',
        'San Jose Sharks': 'S.J',
        'St Louis Blues': 'STL',
        'Tampa Bay Lightning': 'T.B',
        'Toronto Maple Leafs': 'TOR',
        'Vancouver Canucks': 'VAN',
        'Vegas Golden Knights': 'VGK',
        'Washington Capitals': 'WSH',
        'Winnipeg Jets': 'WPG',
    }

    # Build the request URL; the season fills both the "from" and "thru" slots
    games_url = 'https://www.naturalstattrick.com/games.php?fromseason={}&thruseason={}&stype=2&sit={}&loc=B&team=All&rate={}'.format(season, season, sit, rate)

    # Pull every HTML table on the page and keep the first one; "-" cells become NaN
    scraped_tables = pd.read_html(games_url, header=0, index_col=0, na_values=["-"])
    # The first column was read as the index; turn it back into a regular column
    game_log = scraped_tables[0].reset_index()

    def _leading_date(game_label):
        # The first 10 characters of the Game label are the date (YYYY-MM-DD)
        return pd.to_datetime(game_label[0:10])

    game_log['Date'] = game_log['Game'].apply(_leading_date)

    # Running per-team game counter, following the row order of the table
    # NOTE(review): assumes the scraped rows are in chronological order - confirm
    game_log['Game_Number'] = game_log.groupby('Team').cumcount() + 1

    # Replace full team names with their abbreviations
    game_log = game_log.replace({'Team': team_abbreviations})

    # Key used to merge with the game results df
    game_log['Team_Key'] = game_log['Team'].astype(str) + '_' + game_log['Date'].astype(str)
    return game_log


In [4]:
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None


When using the `get_and_format_nst_team_stats` function, `season` is a string containing both years that the season spans; for example, the 2016-2017 season is "20162017". `sit` is the game situation in which the statistics were recorded, so to see the statistics for 5v5 play, pass "5v5". Lastly, `rate` controls whether the statistics are returned as cumulative counts or adjusted to a per-60-minute rate: 'n' is for counts and 'y' is for rates.

In [5]:
# Example: 2016-2017 season, 5v5 play, cumulative counts
get_and_format_nst_team_stats(season='20162017', sit='5v5', rate='n')

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,SF%,GF,GA,GF%,xGF,xGA,xGF%,SCF,SCA,SCF%,HDCF,HDCA,HDCF%,HDSF,HDSA,HDSF%,HDGF,HDGA,HDGF%,HDSH%,HDSV%,MDCF,MDCA,MDCF%,MDSF,MDSA,MDSF%,MDGF,MDGA,MDGF%,MDSH%,MDSV%,LDCF,LDCA,LDCF%,LDSF,LDSA,LDSF%,LDGF,LDGA,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance,Date,Game_Number,Team_Key
0,"2016-10-12 - Maple Leafs 4, Senators 5",OTT,Limited Report Full Report,46:00,36,68,34.62,31,45,40.79,22,31,41.51,4,4,50.00,1.47,2.13,40.77,15,25,37.50,6,10,37.50,4,5,44.44,1,2,33.33,25.00,60.00,9,15,37.50,5,4,55.56,2,1,66.67,40.00,75.00,18,33,35.29,12,21,36.36,1,1,50.0,8.33,95.24,18.18,87.10,1.053,17618,2016-10-12,1,OTT_2016-10-12
1,"2016-10-12 - Maple Leafs 4, Senators 5",TOR,Limited Report Full Report,46:00,68,36,65.38,45,31,59.21,31,22,58.49,4,4,50.00,2.13,1.47,59.23,25,15,62.50,10,6,62.50,5,4,55.56,2,1,66.67,40.00,75.00,15,9,62.50,4,5,44.44,1,2,33.33,25.00,60.00,33,18,64.71,21,12,63.64,1,1,50.0,4.76,91.67,12.90,81.82,0.947,17618,2016-10-12,1,TOR_2016-10-12
2,"2016-10-12 - Blues 5, Blackhawks 2",CHI,Limited Report Full Report,43:54,27,34,44.26,15,27,35.71,11,19,36.67,1,0,100.00,0.84,1.89,30.71,11,20,35.48,5,8,38.46,4,7,36.36,1,0,100.00,25.00,100.00,6,12,33.33,2,7,22.22,0,0,,0.00,100.00,12,10,54.55,5,5,50.00,0,0,,0.00,100.00,9.09,100.00,1.091,21729,2016-10-12,1,CHI_2016-10-12
3,"2016-10-12 - Blues 5, Blackhawks 2",STL,Limited Report Full Report,43:54,34,27,55.74,27,15,64.29,19,11,63.33,0,1,0.00,1.89,0.84,69.29,20,11,64.52,8,5,61.54,7,4,63.64,0,1,0.00,0.00,75.00,12,6,66.67,7,2,77.78,0,0,,0.00,100.00,10,12,45.45,5,5,50.00,0,0,,0.00,100.00,0.00,90.91,0.909,21729,2016-10-12,1,STL_2016-10-12
4,"2016-10-12 - Flames 4, Oilers 7",CGY,Limited Report Full Report,40:51,43,34,55.84,35,28,55.56,26,22,54.17,1,4,20.00,1.65,1.85,47.14,12,18,40.00,4,11,26.67,4,11,26.67,1,3,25.00,25.00,72.73,8,7,53.33,7,3,70.00,0,1,0.00,0.00,66.67,29,14,67.44,14,7,66.67,0,0,,0.00,100.00,3.85,81.82,0.857,18347,2016-10-12,1,CGY_2016-10-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2455,"2017-04-09 - Panthers 2, Capitals 0",WSH,Limited Report Full Report,53:16,58,37,61.05,47,26,64.38,35,18,66.04,0,0,,2.60,1.78,59.37,26,21,55.32,10,7,58.82,6,5,54.55,0,0,,0.00,100.00,16,14,53.33,10,6,62.50,0,0,,0.00,100.00,30,13,69.77,19,7,73.08,0,0,,0.00,100.00,0.00,100.00,1.000,18506,2017-04-09,82,WSH_2017-04-09
2456,"2017-04-09 - Kings 3, Ducks 4",ANA,Limited Report Full Report,44:39,40,39,50.63,31,26,54.39,18,15,54.55,2,2,50.00,2.22,1.62,57.81,21,23,47.73,12,11,52.17,9,8,52.94,1,2,33.33,11.11,75.00,9,12,42.86,1,3,25.00,0,0,,0.00,100.00,19,14,57.58,8,4,66.67,1,0,100.0,12.50,100.00,11.11,86.67,0.978,16564,2017-04-09,82,ANA_2017-04-09
2457,"2017-04-09 - Kings 3, Ducks 4",L.A,Limited Report Full Report,44:39,39,40,49.37,26,31,45.61,15,18,45.45,2,2,50.00,1.62,2.22,42.19,23,21,52.27,11,12,47.83,8,9,47.06,2,1,66.67,25.00,88.89,12,9,57.14,3,1,75.00,0,0,,0.00,100.00,14,19,42.42,4,8,33.33,0,1,0.0,0.00,87.50,13.33,88.89,1.022,16564,2017-04-09,82,L.A_2017-04-09
2458,"2017-04-09 - Canucks 2, Oilers 5",EDM,Limited Report Full Report,49:50,57,38,60.00,49,26,65.33,40,17,70.18,4,2,66.67,3.51,1.51,69.88,34,19,64.15,16,6,72.73,13,3,81.25,3,1,75.00,23.08,66.67,18,13,58.06,11,4,73.33,1,0,100.00,9.09,100.00,22,15,59.46,16,9,64.00,0,1,0.0,0.00,88.89,10.00,88.24,0.982,18347,2017-04-09,82,EDM_2017-04-09


Now that we have our function to scrape season game data from NST, let's create a function to merge the primary dataframe obtained from that function with the PP and PK dataframes. In other words, a function to merge the 5v5, PP, and PK team game logs from NST.

In [6]:
def merge_team_stats(primary_df, pp_df, pk_df):
    """Attach PK and PP time-on-ice and expected-goal columns to the primary game log.

    Each of the three frames must contain `Team_Key`, `TOI`, `xGA` and `xGF`.
    Rows are matched on `Team_Key` with left joins, so every row of
    `primary_df` is kept; keys missing from `pk_df` or `pp_df` yield NaN in
    the added columns.

    Parameters
    ----------
    primary_df : pandas.DataFrame
        Base team game log (e.g. the 5v5 frame); its columns keep their names.
    pp_df : pandas.DataFrame
        PP game log; contributes `TOI_pp`, `xGA_pp`, `xGF_pp`.
    pk_df : pandas.DataFrame
        PK game log; contributes `TOI_pk`, `xGA_pk`, `xGF_pk`.

    Returns
    -------
    pandas.DataFrame
        `primary_df` columns followed by the `_pk` columns, then the `_pp` columns.
    """
    situation_cols = ['Team_Key', 'TOI', 'xGA', 'xGF']

    merged = primary_df.merge(
        pk_df[situation_cols], on='Team_Key', how='left', suffixes=('', '_pk'))
    # Fix: this step previously merged pk_df a second time, so the "_pp"
    # columns duplicated the PK values and pp_df was never used.
    merged = merged.merge(
        pp_df[situation_cols], on='Team_Key', how='left', suffixes=('', '_pp'))
    return merged

# Feature Engineering
