Ensure type of seconds column for the pbp data is a float

HarryShomer · Oct 13, 2019 · c96019c
1 parent fbbdd31
commit c96019c
Show file tree

Hide file tree

Showing 6 changed files with 16 additions and 9 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
+.DS_Store
 .idea/
 .csv/
 tests.py

diff --git a/README.rst b/README.rst
@@ -181,7 +181,7 @@ Here is a simple example of a way to setup live scraping. I strongly suggest che
 
 
 .. NWHL Usage
-----------
+.. -------------
 
 .. Scrape data on a season by season level:
 

diff --git a/hockey_scraper/nhl/game_scraper.py b/hockey_scraper/nhl/game_scraper.py
@@ -148,8 +148,6 @@ def combine_html_json_pbp(json_df, html_df, game_id, date):
     json_df = json_df.drop(['p1_name', 'p2_name', 'p2_ID', 'p3_name', 'p3_ID'], axis=1)
 
     try:
-        html_df.Period = html_df.Period.astype(int)
-
         # If they aren't equal it's usually due to the HTML containing a challenge event
         if html_df.shape[0] == json_df.shape[0]:
             json_df = json_df[['period', 'event', 'seconds_elapsed', 'xC', 'yC']]
@@ -158,12 +156,12 @@ def combine_html_json_pbp(json_df, html_df, game_id, date):
             # We always merge if they aren't equal but we check if it's due to a challenge so we can print out a better
             # warning message for the user.
             # NOTE: May be slightly incorrect. It's possible for there to be a challenge and another issue for one game.
-            if'CHL' in list(html_df.Event):
-                shared.print_warning("The number of columns in the Html and Json pbp are different because the"
+            if 'CHL' in list(html_df.Event):
+                shared.print_warning("The number of rows in the Html and Json pbp are different because the"
                                      " Json pbp, for some reason, does not include challenges. Will instead merge on "
                                      "Period, Event, Time, and p1_id.")
             else:
-                shared.print_warning("The number of columns in the Html and json pbp are different because "
+                shared.print_warning("The number of rows in the Html and json pbp are different because "
                                      "someone fucked up. Will instead merge on Period, Event, Time, and p1_id.")
 
             # Actual Merging
@@ -199,7 +197,6 @@ def combine_espn_html_pbp(html_df, espn_df, game_id, date, away_team, home_team)
     """
     if espn_df is not None:
         try:
-            espn_df.period = espn_df.period.astype(int)
             game_df = pd.merge(html_df, espn_df, left_on=['Period', 'Seconds_Elapsed', 'Event'],
                                right_on=['period', 'time_elapsed', 'event'], how='left')
 

diff --git a/hockey_scraper/nhl/pbp/espn_pbp.py b/hockey_scraper/nhl/pbp/espn_pbp.py
@@ -235,4 +235,9 @@ def scrape_game(date, home_team, away_team, game_id=None):
         shared.print_warning("Error parsing Espn pbp for game {a} {b} {c} {d}".format(a=date, b=home_team, c=away_team, d=e))
         return None
 
+    espn_df.period = espn_df.period.astype(int)
+
     return espn_df
+
+
+
diff --git a/hockey_scraper/nhl/pbp/html_pbp.py b/hockey_scraper/nhl/pbp/html_pbp.py
@@ -808,6 +808,10 @@ def scrape_pbp(game_html, game_id, players, teams):
         shared.print_warning('Error parsing Html pbp for game {} {}'.format(game_id, e))
         return None
 
+    # Period and Seconds_Elapsed sometimes end up with object dtype, so cast them explicitly
+    game_df.Period = game_df.Period.astype(int)
+    game_df.Seconds_Elapsed = game_df.Seconds_Elapsed.astype(float)
+
     return game_df
 
 

diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@ def read():
 
 setup(
     name='hockey_scraper',
-    version='1.34',
+    version='1.34.1',
     description="""This package is designed to allow one to scrape the raw data for both the National Hockey League
                    (NHL) and the National Women's Hockey League (NWHL) off of their respective API and websites.""",
     long_description=read(),
@@ -24,6 +24,6 @@ def read():
     author_email='Harryshomer@gmail.com',
     license='MIT',
     packages=find_packages(),
-    install_requires=['BeautifulSoup4', 'requests', 'lxml', 'html5lib', 'pandas', 'sphinx', 'pytest'],
+    install_requires=['BeautifulSoup4', 'requests', 'lxml', 'html5lib', 'pandas', 'pytest'],
     zip_safe=False
 )