Skip to content

Commit

Permalink
Added Functionality to combine the pbp and shifts
Browse files Browse the repository at this point in the history
  • Loading branch information
HarryShomer committed Aug 21, 2018
1 parent c8427ec commit 2451cf8
Show file tree
Hide file tree
Showing 8 changed files with 197 additions and 3 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file modified hockey_scraper/.DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion hockey_scraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from hockey_scraper.scrape_functions import scrape_games, scrape_date_range, scrape_seasons
from hockey_scraper.scrape_functions import scrape_games, scrape_date_range, scrape_seasons, merge_pbp_shifts



138 changes: 138 additions & 0 deletions hockey_scraper/combine_pbp_shifts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import pandas as pd
import hockey_scraper.shared as shared


def group_shifts_cols(shifts, type_group_cols):
    """
    Group the one-player-per-row shifts by some column subset so that each
    resulting row holds every player going On (or Off) for a team at a given second.

    :param shifts: DataFrame of shifts (one player per row)
    :param type_group_cols: Columns to group on -> the last one is either "Start" or "End"

    :return: Grouped DataFrame with 'player' and 'player_Id' list-columns,
             plus 'Seconds_Elapsed' and 'Event' ("On"/"Off") columns
    """
    # The last grouping column ("Start" or "End") determines whether these
    # rows represent players coming On or going Off
    time_col = type_group_cols[-1]

    # One groupby builds both list-columns at once (previously this took two
    # separate passes that were stitched back together by position)
    grouped_df = shifts.groupby(by=type_group_cols, as_index=False).agg(
        {'Player': list, 'Player_Id': list})

    # Lowercase names mark the "list of players" columns so they aren't confused
    # with the per-player columns ("Player1", "Player1_id", ...) created later
    grouped_df = grouped_df.rename(index=str, columns={'Player': 'player', 'Player_Id': 'player_Id'})

    # Rename either Start or End to Seconds_Elapsed and tag the event type
    grouped_df = grouped_df.rename(index=str, columns={time_col: "Seconds_Elapsed"})
    grouped_df['Event'] = 'On' if time_col == "Start" else "Off"

    return grouped_df


def group_shifts_type(shifts, player_cols, player_id_cols):
    """
    Groups rows by players getting "On" and players getting "Off".

    :param shifts: Shifts DataFrame (one player per row, with Home/Away teams merged in)
    :param player_cols: Player name columns -> "Player1"..."Player6"
    :param player_id_cols: Player id columns -> "Player1_id"..."Player6_id"

    :return: Shifts DataFrame grouped by players on and off every second
    """
    # To subset for On and Off shifts
    group_cols_start = ['Game_Id', 'Period', 'Team', 'Home_Team', 'Away_Team', 'Date', 'Start']
    group_cols_end = ['Game_Id', 'Period', 'Team', 'Home_Team', 'Away_Team', 'Date', 'End']

    # Group by the two column lists above and combine -> now have rows for On and rows for Off
    # pd.concat instead of DataFrame.append (append was removed in pandas 2.0)
    grouped_df = pd.concat([group_shifts_cols(shifts, group_cols_start),
                            group_shifts_cols(shifts, group_cols_end)])

    # Expand the list-columns into one column per player for both names and ids
    players = pd.DataFrame(grouped_df.player.values.tolist(), index=grouped_df.index).rename(
        columns=lambda x: 'Player{}'.format(x + 1))
    player_ids = pd.DataFrame(grouped_df.player_Id.values.tolist(), index=grouped_df.index).rename(
        columns=lambda x: 'Player{}_id'.format(x + 1))

    # There are sometimes more than 6 players coming on at a time...it's rare enough that
    # anything past 6 is dropped. reindex also fills with NaN when FEWER than 6 columns
    # exist (direct indexing would raise a KeyError in that case)
    grouped_df[player_cols] = players.reindex(columns=player_cols)
    grouped_df[player_id_cols] = player_ids.reindex(columns=player_id_cols)

    # Not needed anymore since we converted to new columns
    grouped_df = grouped_df.drop(['player', 'player_Id'], axis=1)

    return grouped_df.reset_index(drop=True)


def group_shifts(games_df, shifts):
    """
    As of now the shifts are 1 player per row. This groups by team by type (on/off) by second.
    So at the beginning of the game we'll have one row with 6 players coming on for the home
    team and the same type of row for the away team.

    :param games_df: DataFrame containing Game_Id, Home_Team, and Away_Team -> shifts doesn't contain home/away
    :param shifts: DataFrame of Shifts

    :return: Grouped Shifts DataFrame
    """
    # Up to 6 players can come on or off at any time
    player_cols = ['Player{}'.format(num) for num in range(1, 7)]
    player_id_cols = ['Player{}_id'.format(num) for num in range(1, 7)]

    # Merge in Home/Away Teams -> needed below to tell home players from away players
    shifts = pd.merge(shifts, games_df, on=['Game_Id'])

    # Groups into on and off shift rows
    grouped_df = group_shifts_type(shifts, player_cols, player_id_cols)

    # As of now the columns are Player1, Player1_id, ...etc.
    # To merge into the pbp we need "home"/"away" prefixed columns for the appropriate players.
    # So we separate by team and rename with a "home" prefix for the home team's rows and an
    # "away" prefix for the away team's rows.
    grouped_df_home = grouped_df[grouped_df.Team == grouped_df.Home_Team]
    grouped_df_away = grouped_df[grouped_df.Team == grouped_df.Away_Team]

    grouped_df_home = grouped_df_home.rename(
        index=str, columns={col: 'home' + col for col in player_cols + player_id_cols})
    grouped_df_away = grouped_df_away.rename(
        index=str, columns={col: 'away' + col for col in player_cols + player_id_cols})

    # pd.concat instead of DataFrame.append (append was removed in pandas 2.0)
    return pd.concat([grouped_df_home, grouped_df_away]).reset_index(drop=True)


def merge_pbp_shifts(pbp_df, shifts_df):
    """
    Merge the shifts_df into the pbp_df.

    The shifts are grouped into "On"/"Off" event rows, appended to the pbp, and
    everything is re-sorted by game/period/time so the shift events land in order.

    :param pbp_df: Play by Play DataFrame
    :param shifts_df: Shift Tables DataFrame

    :return: Play by Play DataFrame with shift info embedded
    """
    # To get the final pbp columns in the "correct" order
    pbp_columns = pbp_df.columns

    shifts_df['Player_Id'] = shifts_df['Player_Id'].astype(int)

    # Get unique game_id -> teams pairs for placing in shifts_df (it lacks home/away)
    pbp_unique = pbp_df.drop_duplicates(subset=['Game_Id', 'Home_Team', 'Away_Team'])[['Game_Id', 'Home_Team', 'Away_Team']]

    new_shifts = group_shifts(pbp_unique, shifts_df)

    # Sort placing Off before On for the same second ('Off' sorts before 'On')
    new_shifts = new_shifts.sort_values(by=['Game_Id', 'Period', 'Seconds_Elapsed', 'Event'])

    # Replace NaN with None
    new_shifts = new_shifts.where((pd.notnull(new_shifts)), None)

    # Get Time Elapsed for shifts
    # Apply over the single column instead of a (much slower) row-wise apply
    new_shifts['Time_Elapsed'] = new_shifts['Seconds_Elapsed'].apply(shared.convert_to_time)

    # pd.concat instead of DataFrame.append (append was removed in pandas 2.0)
    new_pbp = pd.concat([pbp_df, new_shifts]).reset_index(drop=True)
    new_pbp = new_pbp.sort_values(by=['Game_Id', 'Period', 'Seconds_Elapsed', 'Event'])

    return new_pbp[pbp_columns]
3 changes: 2 additions & 1 deletion hockey_scraper/json_shifts.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,8 @@ def scrape_game(game_id):
"""
shifts_json = get_shifts(game_id)

if not shifts_json:
# Total is the total number of shifts recorded...I just chose 25
if not shifts_json or shifts_json['total'] < 25:
print("Json shifts for game {} is either not there or can't be obtained".format(game_id))
return None

Expand Down
40 changes: 40 additions & 0 deletions hockey_scraper/scrape_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@

import hockey_scraper.json_schedule as json_schedule
import hockey_scraper.game_scraper as game_scraper
import hockey_scraper.combine_pbp_shifts as combine_pbp_shifts
import hockey_scraper.shared as shared
import pandas as pd
import time
import random
import os


# This hold the scraping errors in a string format.
Expand Down Expand Up @@ -104,6 +106,11 @@ def to_csv(file_name, pbp_df, shifts_df):
:return: None
"""
# TODO: Deposit in docs_dir if exist (use os.path.exist..or whatever)
if shared.docs_dir:
pbp_file = os.path.join(shared.docs_dir, 'nhl_pbp{}.csv'.format(file_name))
shifts_file = os.path.join(shared.docs_dir, 'nhl_shifts{}.csv'.format(file_name))

if pbp_df is not None:
print("\nPbp data deposited in file - " + 'nhl_pbp{}.csv'.format(file_name))
pbp_df.to_csv('nhl_pbp{}.csv'.format(file_name), sep=',', encoding='utf-8')
Expand Down Expand Up @@ -263,6 +270,39 @@ def scrape_games(games, if_scrape_shifts, data_format='csv', rescrape=False, doc
"errors": errors}


def merge_pbp_shifts(pbp_df, shifts_df):
    """
    Merge the Shifts into the Play by Play.

    Note: Only works on season level.

    :param pbp_df: DataFrame of PBP data
    :param shifts_df: DataFrame of Shift data

    :return: DataFrame of pbp with shift info embedded
    """
    # Guard: both arguments must be DataFrames
    if not isinstance(pbp_df, pd.DataFrame) or not isinstance(shifts_df, pd.DataFrame):
        raise shared.HaltException("Both Arguments must be DataFrames")

    # Guard: first must be the pbp and second must be the shifts
    if 'Event' not in pbp_df.columns or 'Start' not in shifts_df.columns:
        raise shared.HaltException('Incorrect DataFrames given. The first argument must be the pbp DataFrame and the '
                                   'second the shifts.')

    # Guard: both must cover exactly the same games
    pbp_games = set(pbp_df['Game_Id'].values.tolist())
    shift_games = set(shifts_df['Game_Id'].values.tolist())
    if pbp_games != shift_games:
        raise shared.HaltException("Both DataFrames must contain information from the same set of games")

    # Guard: all games must belong to one season.
    # If every Game_Id maps to a single Date, the count of unique (Game_Id, Date)
    # pairs equals the count of unique Game_Id's; a mismatch means a Game_Id
    # repeats across seasons.
    game_date_pairs = pbp_df.drop_duplicates(subset=['Game_Id', 'Date'])[['Game_Id', 'Date']].values.tolist()
    game_ids = pbp_df.drop_duplicates(subset=['Game_Id'])['Game_Id'].values.tolist()
    if len(game_date_pairs) != len(game_ids):
        raise shared.HaltException("merge_pbp_shifts only works on sets of games of the same season.")

    return combine_pbp_shifts.merge_pbp_shifts(pbp_df, shifts_df)




Expand Down
15 changes: 15 additions & 0 deletions hockey_scraper/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import os
import time
from math import floor
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
Expand Down Expand Up @@ -182,6 +183,20 @@ def convert_to_seconds(minutes):
return datetime.timedelta(hours=x.tm_hour, minutes=x.tm_min, seconds=x.tm_sec).total_seconds()


def convert_to_time(seconds_elapsed):
    """
    Convert from seconds elapsed to M:S

    :param seconds_elapsed: 0 - 1200

    :return: Time -> ex: 5:30
    """
    # divmod splits minutes/seconds in one shot; truncating to int first matches
    # the original's floor/int handling for non-negative inputs.
    # Note: seconds are NOT zero-padded (65 -> "1:5"), same as before.
    minutes, seconds = divmod(int(seconds_elapsed), 60)

    return '{}:{}'.format(minutes, seconds)


def scrape_page(url):
"""
Scrape a given url
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def read():

setup(
name='hockey_scraper',
version='1.2.6',
version='1.2.6.1',
description="""This package is designed to allow people to scrape Play by Play and Shift data off of the National
Hockey League (NHL) API and website for all preseason, regular season and playoff games since the
2007-2008 season""",
Expand Down

0 comments on commit 2451cf8

Please sign in to comment.