-
Notifications
You must be signed in to change notification settings - Fork 42
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added Functionality to combine the pbp and shifts
- Loading branch information
1 parent
c8427ec
commit 2451cf8
Showing
8 changed files
with
197 additions
and
3 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
from hockey_scraper.scrape_functions import scrape_games, scrape_date_range, scrape_seasons | ||
from hockey_scraper.scrape_functions import scrape_games, scrape_date_range, scrape_seasons, merge_pbp_shifts | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
import pandas as pd | ||
import hockey_scraper.shared as shared | ||
|
||
|
||
def group_shifts_cols(shifts, type_group_cols):
    """
    Collapse one-player-per-row shifts into one row per group, collecting the
    players (and their ids) of each group into lists.

    :param shifts: DataFrame of shifts. Must contain the columns "Player" and
                   "Player_Id" plus every column named in type_group_cols
    :param type_group_cols: Columns to group by. The last entry is the time
                            column -> "Start" for players coming On, anything
                            else (i.e. "End") is treated as players going Off
    :return: DataFrame with the grouping columns (the time column renamed to
             "Seconds_Elapsed"), a "player" column and a "player_Id" column
             each holding a list per row, and an "Event" column ("On"/"Off")
    """
    # The last grouping column is the one holding the shift time
    time_col = type_group_cols[-1]

    grouped = shifts.groupby(by=type_group_cols)

    # Aggregate each column on its own and name the result explicitly. Relying
    # on reset_index() producing a column called 0 depends on the pandas version.
    players = grouped['Player'].apply(list).rename('player')
    player_ids = grouped['Player_Id'].apply(list).rename('player_Id')

    # Both Series come from the same groupby so they share the same index
    grouped_df = pd.concat([players, player_ids], axis=1).reset_index()

    # Rename either Start or End to Seconds Elapsed.
    # index=str is kept so the returned row labels are strings, as before.
    grouped_df = grouped_df.rename(index=str, columns={time_col: "Seconds_Elapsed"})
    grouped_df['Event'] = 'On' if time_col == "Start" else "Off"

    return grouped_df
|
||
|
||
def _spread_list_column(list_col, new_cols):
    """
    Expand a Series of lists into one column per list position.

    :param list_col: Series where every value is a list
    :param new_cols: Names for the resulting columns -> also fixes how many
                     list positions are kept
    :return: DataFrame indexed like list_col with exactly the columns in new_cols
    """
    wide = pd.DataFrame(list_col.tolist(), index=list_col.index)

    # Pad with empty columns when every list is shorter than len(new_cols) and
    # cut off the extra positions when some list is longer
    wide = wide.reindex(columns=range(len(new_cols)))
    wide.columns = list(new_cols)

    return wide


def group_shifts_type(shifts, player_cols, player_id_cols):
    """
    Groups rows by players getting "On" and players getting "Off"

    :param shifts: Shifts DataFrame with Home_Team and Away_Team already merged in
    :param player_cols: Names of the output columns for players (one per slot)
    :param player_id_cols: Names of the output columns for player ids (one per slot)
    :return: Shifts DataFrame grouped by players on and off every second, with
             one column per player slot instead of list columns
    """
    # Everything we group by except for the time column
    base_cols = ['Game_Id', 'Period', 'Team', 'Home_Team', 'Away_Team', 'Date']

    # Group once by shift start (On) and once by shift end (Off) and stack them.
    # ignore_index gives unique row labels, which the column-wise concat below needs.
    grouped_df = pd.concat(
        [group_shifts_cols(shifts, base_cols + ['Start']),
         group_shifts_cols(shifts, base_cols + ['End'])],
        ignore_index=True,
    )

    # Convert the columns which contain a list to the appropriate columns for both player and player_id
    # There are sometimes more players coming on at a time than there are slots...the extras are dropped
    players = _spread_list_column(grouped_df['player'], player_cols)
    player_ids = _spread_list_column(grouped_df['player_Id'], player_id_cols)

    # The list columns aren't needed anymore since we converted to new columns
    grouped_df = grouped_df.drop(['player', 'player_Id'], axis=1)
    grouped_df = pd.concat([grouped_df, players, player_ids], axis=1)

    return grouped_df.reset_index(drop=True)
|
||
|
||
def group_shifts(games_df, shifts):
    """
    As of now the shifts are 1 player per row. This groups by team by type (on/off) by second. So at the beginning of
    the game we'll have one row with 6 players coming on for the home team and the same row for the away team.

    :param games_df: DataFrame containing Game_Id, Home_Team, and Away_Team -> Shifts_df doesn't contain home/away
    :param shifts: DataFrame of Shifts
    :return: Grouped Shifts DataFrame. Home rows carry homePlayer1...homePlayer6_id columns and away rows carry
             awayPlayer1...awayPlayer6_id columns; the other team's columns are empty on each row
    """
    # Up to 6 players on and off any time
    player_cols = ['Player{}'.format(num) for num in range(1, 7)]
    player_id_cols = ['Player{}_id'.format(num) for num in range(1, 7)]
    slot_cols = player_cols + player_id_cols

    # Merge in Home/Away Teams
    shifts = pd.merge(shifts, games_df, on=['Game_Id'])

    # Groups into on and off shift rows
    grouped_df = group_shifts_type(shifts, player_cols, player_id_cols)

    # As of now the columns are Player1, Player1_id...etc.
    # To merge into the pbp the columns need a "home" or "away" prefix depending on which team the row belongs to.
    # So we separate the rows by team and rename the columns for each side.
    is_home = grouped_df.Team == grouped_df.Home_Team
    is_away = grouped_df.Team == grouped_df.Away_Team
    grouped_df_home = grouped_df[is_home].rename(columns={col: 'home' + col for col in slot_cols})
    grouped_df_away = grouped_df[is_away].rename(columns={col: 'away' + col for col in slot_cols})

    # DataFrame.append no longer exists in current pandas -> concat stacks the two the same way
    return pd.concat([grouped_df_home, grouped_df_away], sort=False).reset_index(drop=True)
|
||
|
||
def merge_pbp_shifts(pbp_df, shifts_df):
    """
    Merge the shifts_df into the pbp_df

    :param pbp_df: Play by Play DataFrame. Needs Game_Id, Home_Team, Away_Team, Period, Seconds_Elapsed and Event
    :param shifts_df: Shift Tables DataFrame (not modified)
    :return: Play by Play DataFrame with the shift rows ("On"/"Off" events) embedded. Only the columns that were
             already in pbp_df are returned
    """
    # To get the final pbp columns in the "correct" order
    pbp_columns = pbp_df.columns
    sort_cols = ['Game_Id', 'Period', 'Seconds_Elapsed', 'Event']

    # assign() works on a copy so the DataFrame passed in by the caller is left untouched
    shifts_df = shifts_df.assign(Player_Id=shifts_df['Player_Id'].astype(int))

    # Get unique game_id -> teams pair for placing in Shifts_df
    pbp_unique = pbp_df[['Game_Id', 'Home_Team', 'Away_Team']].drop_duplicates()

    new_shifts = group_shifts(pbp_unique, shifts_df)

    # Sort placing Off before On for same second ("Off" sorts before "On" as a string)
    new_shifts = new_shifts.sort_values(by=sort_cols)

    # Replace Nan with None
    new_shifts = new_shifts.where(pd.notnull(new_shifts), None)

    # Get Time Elapsed for shifts -> only one column is needed so there's no reason to go row by row
    new_shifts['Time_Elapsed'] = new_shifts['Seconds_Elapsed'].apply(shared.convert_to_time)

    # DataFrame.append no longer exists in current pandas -> concat stacks the two the same way.
    # The index is reset before sorting (as it always was), so row labels reflect the pre-sort position.
    new_pbp = pd.concat([pbp_df, new_shifts], sort=False).reset_index(drop=True).sort_values(by=sort_cols)

    return new_pbp[pbp_columns]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters