diff --git a/.DS_Store b/.DS_Store
index 304ee08..4c545fb 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/hockey_scraper/.DS_Store b/hockey_scraper/.DS_Store
index c298b27..a91e06f 100755
Binary files a/hockey_scraper/.DS_Store and b/hockey_scraper/.DS_Store differ
diff --git a/hockey_scraper/__init__.py b/hockey_scraper/__init__.py
index 824b4a1..503b8cf 100755
--- a/hockey_scraper/__init__.py
+++ b/hockey_scraper/__init__.py
@@ -1,4 +1,4 @@
-from hockey_scraper.scrape_functions import scrape_games, scrape_date_range, scrape_seasons
+from hockey_scraper.scrape_functions import scrape_games, scrape_date_range, scrape_seasons, merge_pbp_shifts
 
 
 
diff --git a/hockey_scraper/combine_pbp_shifts.py b/hockey_scraper/combine_pbp_shifts.py
new file mode 100644
index 0000000..7259c02
--- /dev/null
+++ b/hockey_scraper/combine_pbp_shifts.py
@@ -0,0 +1,138 @@
+import pandas as pd
+import hockey_scraper.shared as shared
+
+
+def group_shifts_cols(shifts, type_group_cols):
+    """
+    Group into columns for players by some column subset
+
+    :param shifts: DataFrame of shifts
+    :param type_group_cols: Some columns -> Either for On or Off
+
+    :return: Grouped DataFrame
+    """
+    # Group both by player and player id get a new columns with a list of the group
+    # The "Player" and "Player_Id" column contain a list of the grouped up players/player_ids
+    grouped_df_player = shifts.groupby(by=type_group_cols, as_index=False)['Player'].apply(list).reset_index()
+    grouped_df_playerid = shifts.groupby(by=type_group_cols, as_index=False)['Player_Id'].apply(list).reset_index()
+
+    # Rename from nothing to something
+    grouped_df_player = grouped_df_player.rename(index=str, columns={0: 'player'})
+    grouped_df_playerid = grouped_df_playerid.rename(index=str, columns={0: 'player_Id'})
+
+    # Player and Player Id are done separately above bec. they wouldn't work together
+    # So just did both and slid over the relevant columns here
+    grouped_df_player['player_Id'] = grouped_df_playerid['player_Id']
+
+    # Rename either Start or End to Seconds Elapsed
+    grouped_df_player = grouped_df_player.rename(index=str, columns={type_group_cols[-1:][0]: "Seconds_Elapsed"})
+    grouped_df_player['Event'] = 'On' if type_group_cols[-1:][0] == "Start" else "Off"
+
+    return grouped_df_player
+
+
+def group_shifts_type(shifts, player_cols, player_id_cols):
+    """
+    Groups rows by players getting "On" and players getting "Off"
+
+    :param shifts: Shifts_df
+    :param player_cols: Columns for players (see previous function)
+    :param player_id_cols: Column for player ids' (see previous functions)
+
+    :return: Shifts DataFrame grouped by players on and off every second
+    """
+    # To subset for On and Off shifts
+    group_cols_start = ['Game_Id', 'Period', 'Team', 'Home_Team', 'Away_Team', 'Date', 'Start']
+    group_cols_end = ['Game_Id', 'Period', 'Team', 'Home_Team', 'Away_Team', 'Date', 'End']
+
+    # Group by two type of column list above and then combine the two
+    # Now have rows for On and rows for Off
+    grouped_df_on = group_shifts_cols(shifts, group_cols_start)
+    grouped_df_off = group_shifts_cols(shifts, group_cols_end)
+    grouped_df = grouped_df_on.append(grouped_df_off)
+
+    # Convert the Column which contain a list to the appropriate columns for both player and player_id
+    players = pd.DataFrame(grouped_df.player.values.tolist(), index=grouped_df.index).rename(
+        columns=lambda x: 'Player{}'.format(x + 1))
+    player_ids = pd.DataFrame(grouped_df.player_Id.values.tolist(), index=grouped_df.index).rename(
+        columns=lambda x: 'Player{}_id'.format(x + 1))
+
+    # There are sometimes more than 6 players coming on at a time...it's not my problem (it's rare enough)
+    grouped_df[player_cols] = players[['Player1', 'Player2', 'Player3', 'Player4', 'Player5', 'Player6']]
+    grouped_df[player_id_cols] = player_ids[['Player1_id', 'Player2_id', 'Player3_id', 'Player4_id', 'Player5_id', 'Player6_id']]
+
+    # Not needed anymore since we converted to new columns
+    grouped_df = grouped_df.drop(['player', 'player_Id'], axis=1)
+
+    # Convert
+    #grouped_df[player_id_cols] = grouped_df[player_id_cols].apply(pd.to_numeric, errors='coerce')
+
+    return grouped_df.reset_index(drop=True)
+
+
+def group_shifts(games_df, shifts):
+    """
+    As of now the shifts are 1 player per row. This groups by team by type (on/off) by second. So at the beginning of
+    the game we'll have one row with 6 players coming on for the home team and the same row for the away team.
+
+    :param games_df: DataFrame containing Game_Id, Home_Team, and Away_Team -> Shifts_df doesn't contains home/away
+    :param shifts: DataFrame of Shifts
+
+    :return: Grouped Shifts DataFrame
+    """
+    # Up to 6 players on and off any time
+    player_cols = [''.join(['Player', str(num)]) for num in range(1, 7)]
+    player_id_cols = [''.join(['Player', str(num), '_id']) for num in range(1, 7)]
+
+    # Merge in Home/Away Teams
+    shifts = pd.merge(shifts, games_df, on=['Game_Id'])
+
+    # Groups into on and off shift rows
+    grouped_df = group_shifts_type(shifts, player_cols, player_id_cols)
+
+    # Separate home and away for the purpose of the player columns (read below for more info)
+    grouped_df_home = grouped_df[grouped_df.Team == grouped_df.Home_Team]
+    grouped_df_away = grouped_df[grouped_df.Team == grouped_df.Away_Team]
+
+    # Rename Players columns into both home and away
+    # As on now it's player1, player1_id...etc.
+    # To merge into the pbp we need to append home and away for the appropriate players
+    # So we separate them and rename them with a "home" for the home teams and "away" for away teams
+    grouped_df_home = grouped_df_home.rename(index=str, columns={col: 'home' + col for col in player_cols})
+    grouped_df_home = grouped_df_home.rename(index=str, columns={col: 'home' + col for col in player_id_cols})
+    grouped_df_away = grouped_df_away.rename(index=str, columns={col: 'away' + col for col in player_cols})
+    grouped_df_away = grouped_df_away.rename(index=str, columns={col: 'away' + col for col in player_id_cols})
+
+    return grouped_df_home.append(grouped_df_away).reset_index(drop=True)
+
+
+def merge_pbp_shifts(pbp_df, shifts_df):
+    """
+    Merge the shifts_df into the pbp_df
+
+    :param pbp_df: Play by Play DataFrame
+    :param shifts_df: Shift Tables DataFrame
+
+    :return: Play by Play DataFrame with shift info embedded
+    """
+    # To get the final pbp columns in the "correct" order
+    pbp_columns = pbp_df.columns
+
+    shifts_df['Player_Id'] = shifts_df['Player_Id'].astype(int)
+
+    # Get unique game_id -> teams pair for placing in Shifts_df
+    pbp_unique = pbp_df.drop_duplicates(subset=['Game_Id', 'Home_Team', 'Away_Team'])[['Game_Id', 'Home_Team', 'Away_Team']]
+
+    new_shifts = group_shifts(pbp_unique, shifts_df)
+
+    # Sort placing Off before On for same second
+    new_shifts = new_shifts.sort_values(by=['Game_Id', 'Period', 'Seconds_Elapsed', 'Event'])
+
+    # Replace Nan with None
+    new_shifts = new_shifts.where((pd.notnull(new_shifts)), None)
+
+    # Get Time Elapsed for shifts
+    new_shifts['Time_Elapsed'] = new_shifts.apply(lambda x: shared.convert_to_time(x['Seconds_Elapsed']), axis=1)
+
+    new_pbp = pbp_df.append(new_shifts).reset_index(drop=True).sort_values(by=['Game_Id', 'Period', 'Seconds_Elapsed', 'Event'])
+    return new_pbp[pbp_columns]
diff --git a/hockey_scraper/json_shifts.py b/hockey_scraper/json_shifts.py
index 9bc3545..5c87ae2 100755
--- a/hockey_scraper/json_shifts.py
+++ b/hockey_scraper/json_shifts.py
@@ -113,7 +113,8 @@ def scrape_game(game_id):
     """
     shifts_json = get_shifts(game_id)
 
-    if not shifts_json:
+    # Total is the total number of shifts recorded...I just chose 25
+    if not shifts_json or shifts_json['total'] < 25:
         print("Json shifts for game {} is either not there or can't be obtained".format(game_id))
         return None
 
diff --git a/hockey_scraper/scrape_functions.py b/hockey_scraper/scrape_functions.py
index 199923c..eeefd00 100755
--- a/hockey_scraper/scrape_functions.py
+++ b/hockey_scraper/scrape_functions.py
@@ -4,10 +4,12 @@
 import hockey_scraper.json_schedule as json_schedule
 import hockey_scraper.game_scraper as game_scraper
+import hockey_scraper.combine_pbp_shifts as combine_pbp_shifts
 import hockey_scraper.shared as shared
 import pandas as pd
 import time
 import random
+import os
 
 
 # This hold the scraping errors in a string format.
@@ -104,6 +106,11 @@ def to_csv(file_name, pbp_df, shifts_df):
     :return: None
     """
+    # TODO: Deposit in docs_dir if exist (use os.path.exist..or whatever)
+    if shared.docs_dir:
+        pbp_file = os.path.join(shared.docs_dir, 'nhl_pbp{}.csv'.format(file_name))
+        shifts_file = os.path.join(shared.docs_dir, 'nhl_shifts{}.csv'.format(file_name))
+
     if pbp_df is not None:
         print("\nPbp data deposited in file - " + 'nhl_pbp{}.csv'.format(file_name))
         pbp_df.to_csv('nhl_pbp{}.csv'.format(file_name), sep=',', encoding='utf-8')
@@ -263,6 +270,39 @@
             "errors": errors}
 
 
+
+
+def merge_pbp_shifts(pbp_df, shifts_df):
+    """
+    Merge the Shifts into the Play by Play
+
+    Note: Only works on season level
+
+    :param pbp_df: DataFrame of PBP data
+    :param shifts_df: DataFrame of Shift data
+
+    :return: DataFrame of pbp with shift info embedded
+    """
+    # Must both be DataFrames
+    if not (isinstance(pbp_df, pd.DataFrame) and isinstance(shifts_df, pd.DataFrame)):
+        raise shared.HaltException("Both Arguments must be DataFrames")
+
+    # First must be pbp and second must be shifts
+    if not ('Event' in pbp_df.columns and 'Start' in shifts_df.columns):
+        raise shared.HaltException('Incorrect DataFrames given. The first argument must be the pbp DataFrame and the '
+                                   'second the shifts.')
+
+    # Check if same games in both
+    if set(pbp_df['Game_Id'].values.tolist()) != set(shifts_df['Game_Id'].values.tolist()):
+        raise shared.HaltException("Both DataFrames must contain information from the same set of games")
+
+    # Check if games in same season
+    # Get set of Game_Id/Date and just Game_Id -> If dates don't matter they'll be equal
+    game_unique = pbp_df.drop_duplicates(subset=['Game_Id', 'Date'])[['Game_Id', 'Date']].values.tolist()
+    game_id_unique = pbp_df.drop_duplicates(subset=['Game_Id'])['Game_Id'].values.tolist()
+    if len(game_unique) != len(game_id_unique):
+        raise shared.HaltException("merge_pbp_shifts only works on sets of games of the same season.")
+
+    return combine_pbp_shifts.merge_pbp_shifts(pbp_df, shifts_df)
+
diff --git a/hockey_scraper/shared.py b/hockey_scraper/shared.py
index 2aeeb80..f93e6da 100755
--- a/hockey_scraper/shared.py
+++ b/hockey_scraper/shared.py
@@ -6,6 +6,7 @@
 import os
 import time
+from math import floor
 import requests
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
@@ -182,6 +183,21 @@ def convert_to_seconds(minutes):
     return datetime.timedelta(hours=x.tm_hour, minutes=x.tm_min, seconds=x.tm_sec).total_seconds()
 
 
+def convert_to_time(seconds_elapsed):
+    """
+    Convert from seconds elapsed to M:S
+
+    :param seconds_elapsed: 0 - 1200
+
+    :return: Time -> ex: 5:30
+    """
+    minutes = floor(seconds_elapsed / 60)
+    seconds = int(seconds_elapsed - minutes * 60)
+
+    # Zero-pad the seconds so e.g. 305s -> "5:05" rather than "5:5"
+    return '{}:{:02d}'.format(minutes, seconds)
+
+
 def scrape_page(url):
     """
     Scrape a given url
diff --git a/setup.py b/setup.py
index 6606a3a..5cbfaf1 100755
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@ def read():
 
 setup(
     name='hockey_scraper',
-    version='1.2.6',
+    version='1.2.6.1',
     description="""This package is designed to allow people to scrape Play by Play and Shift data off of the National
                    Hockey League (NHL) API and website for all preseason, regular season and playoff games since the
                    2007-2008 season""",