Skip to content

Commit

Permalink
Ensure type of seconds column for the pbp data is a float
Browse files Browse the repository at this point in the history
  • Loading branch information
HarryShomer committed Oct 13, 2019
1 parent fbbdd31 commit c96019c
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 9 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.DS_Store
.idea/
.csv/
tests.py
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ Here is a simple example of a way to setup live scraping. I strongly suggest che


.. NWHL Usage
----------
.. -------------
.. Scrape data on a season by season level:
Expand Down
9 changes: 3 additions & 6 deletions hockey_scraper/nhl/game_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,6 @@ def combine_html_json_pbp(json_df, html_df, game_id, date):
json_df = json_df.drop(['p1_name', 'p2_name', 'p2_ID', 'p3_name', 'p3_ID'], axis=1)

try:
html_df.Period = html_df.Period.astype(int)

# If they aren't equal it's usually due to the HTML containing a challenge event
if html_df.shape[0] == json_df.shape[0]:
json_df = json_df[['period', 'event', 'seconds_elapsed', 'xC', 'yC']]
Expand All @@ -158,12 +156,12 @@ def combine_html_json_pbp(json_df, html_df, game_id, date):
# We always merge if they aren't equal but we check if it's due to a challenge so we can print out a better
# warning message for the user.
# NOTE: May be slightly incorrect. It's possible for there to be a challenge and another issue for one game.
if'CHL' in list(html_df.Event):
shared.print_warning("The number of columns in the Html and Json pbp are different because the"
if 'CHL' in list(html_df.Event):
shared.print_warning("The number of rows in the Html and Json pbp are different because the"
" Json pbp, for some reason, does not include challenges. Will instead merge on "
"Period, Event, Time, and p1_id.")
else:
shared.print_warning("The number of columns in the Html and json pbp are different because "
shared.print_warning("The number of rows in the Html and json pbp are different because "
"someone fucked up. Will instead merge on Period, Event, Time, and p1_id.")

# Actual Merging
Expand Down Expand Up @@ -199,7 +197,6 @@ def combine_espn_html_pbp(html_df, espn_df, game_id, date, away_team, home_team)
"""
if espn_df is not None:
try:
espn_df.period = espn_df.period.astype(int)
game_df = pd.merge(html_df, espn_df, left_on=['Period', 'Seconds_Elapsed', 'Event'],
right_on=['period', 'time_elapsed', 'event'], how='left')

Expand Down
5 changes: 5 additions & 0 deletions hockey_scraper/nhl/pbp/espn_pbp.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,4 +235,9 @@ def scrape_game(date, home_team, away_team, game_id=None):
shared.print_warning("Error parsing Espn pbp for game {a} {b} {c} {d}".format(a=date, b=home_team, c=away_team, d=e))
return None

espn_df.period = espn_df.period.astype(int)

return espn_df



4 changes: 4 additions & 0 deletions hockey_scraper/nhl/pbp/html_pbp.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,6 +808,10 @@ def scrape_pbp(game_html, game_id, players, teams):
shared.print_warning('Error parsing Html pbp for game {} {}'.format(game_id, e))
return None

# These sometimes end up as objects
game_df.Period = game_df.Period.astype(int)
game_df.Seconds_Elapsed = game_df.Seconds_Elapsed.astype(float)

return game_df


Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def read():

setup(
name='hockey_scraper',
version='1.34',
version='1.34.1',
description="""This package is designed to allow one to scrape the raw data for both the National Hockey League
(NHL) and the National Women's Hockey League (NWHL) off of their respective API and websites.""",
long_description=read(),
Expand All @@ -24,6 +24,6 @@ def read():
author_email='Harryshomer@gmail.com',
license='MIT',
packages=find_packages(),
install_requires=['BeautifulSoup4', 'requests', 'lxml', 'html5lib', 'pandas', 'sphinx', 'pytest'],
install_requires=['BeautifulSoup4', 'requests', 'lxml', 'html5lib', 'pandas', 'pytest'],
zip_safe=False
)

0 comments on commit c96019c

Please sign in to comment.