update nhl schedule to always rescrape the page. See the docstring for nhl.json_schedule.get_schedule for more details

HarryShomer committed Jan 25, 2021
1 parent eff06f3 commit fb14534

Showing 8 changed files with 31 additions and 19 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst

@@ -47,3 +47,4 @@ v1.37
 * Now saves scraped pages in docs_dir as a GZIP
 * Only print full error summary when the number of games scraped is >= 25
 * Remove hardcoded exception for Sebastian Aho. Updated process to work without it.
+* Always rescrape schedule pages
5 changes: 3 additions & 2 deletions hockey_scraper/__init__.py

@@ -1,6 +1,7 @@
 #from .nwhl import scrape_functions as nwhl
 from .nhl.live_scrape import ScrapeLiveGames, LiveGame
 from .nhl.scrape_functions import scrape_games, scrape_date_range, scrape_seasons, scrape_schedule
 from .nhl import live_scrape
 from .utils import shared
-from . import utils
+from . import utils
+
+#from .nwhl import scrape_schedule as nwhl_scrape_schedule
6 changes: 5 additions & 1 deletion hockey_scraper/nhl/json_schedule.py

@@ -8,6 +8,9 @@
 import hockey_scraper.utils.shared as shared


+# TODO: Currently rescraping page each time since the status of some games may have changed
+# (e.g. Scraped on 2020-01-20 and game on 2020-01-21 was not Final...when use old page again will still think not Final)
+# Need to find a more elegant way of doing this
 def get_schedule(date_from, date_to):
     """
     Scrapes games in date range
@@ -25,7 +28,7 @@ def get_schedule(date_from, date_to):
         "season": shared.get_season(date_from),
     }

-    return json.loads(shared.get_file(page_info))
+    return json.loads(shared.get_file(page_info, force=True))


 def chunk_schedule_calls(from_date, to_date):
@@ -129,4 +132,5 @@ def scrape_schedule(date_from, date_to, preseason=False, not_over=False):
                 "status": game["status"]["abstractGameState"]
             })

+
     return schedule
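
The TODO above is the heart of this commit: a schedule page cached before its games finished will keep reporting them as not Final on every later run, so get_schedule now opts out of the cache entirely. Below is a minimal sketch of the cache-bypass idea; the helper names fetch_url and CACHE_DIR are hypothetical stand-ins for the library's scrape_page and shared.docs_dir, and the real code additionally gzips saved pages.

import os
import urllib.request

CACHE_DIR = "docs"  # stand-in for shared.docs_dir

def fetch_url(url):
    # Stand-in for the library's scrape_page(): a plain HTTP GET
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode("utf-8")

def get_file(url, name, force=False):
    # Serve the cached copy only when one exists AND the caller didn't force a rescrape
    path = os.path.join(CACHE_DIR, name + ".txt")
    if not force and os.path.isfile(path):
        with open(path, encoding="utf-8") as f:
            return f.read()
    page = fetch_url(url)
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(page)
    return page

Game pages are effectively immutable once a game is Final, so they keep the cache-first behavior; only the volatile schedule pages pay for a network round trip on every run.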
4 changes: 2 additions & 2 deletions hockey_scraper/nhl/scrape_functions.py

@@ -194,11 +194,11 @@ def scrape_seasons(seasons, if_scrape_shifts, data_format='csv', preseason=False
         if data_format.lower() == 'csv':
             shared.to_csv(str(season) + str(season + 1), pbp_df, "nhl", "pbp")
             shared.to_csv(str(season) + str(season + 1), shifts_df, "nhl", "shifts")
-        else:
+        elif pbp_df is not None:
             master_pbps.append(pbp_df)
             master_shifts.append(shifts_df)

-    if data_format.lower() == 'pandas':
+    if data_format.lower() == 'pandas' and master_pbps:
         if if_scrape_shifts:
             return {"pbp": pd.concat(master_pbps), "shifts": pd.concat(master_shifts)}
         else:
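
Two failure modes evidently motivate this pair of guards: pbp_df can come back as None when a season fails to scrape, and pd.concat raises ValueError ("No objects to concatenate") when every season failed and the list is empty. A standalone sketch of the pattern, with an illustrative season_results list:

import pandas as pd

master_pbps = []
season_results = [pd.DataFrame({"event": ["GOAL"]}), None]  # None = a season that failed to scrape

for pbp_df in season_results:
    if pbp_df is not None:   # mirrors the new elif: never append a failed season
        master_pbps.append(pbp_df)

if master_pbps:              # mirrors the new 'and master_pbps': pd.concat([]) raises ValueError
    print(pd.concat(master_pbps))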
2 changes: 1 addition & 1 deletion hockey_scraper/nwhl/scrape_functions.py

@@ -4,7 +4,7 @@
 import random
 import pandas as pd

-from . import html_schedule, json_pbp
+#from . import html_schedule, json_pbp
 import hockey_scraper.utils.shared as shared

 # All columns for the pbp
21 changes: 13 additions & 8 deletions hockey_scraper/nwhl/scrape_schedule.py

@@ -39,7 +39,7 @@ def scrape_dynamic(url):
     return pg


-def get_schedule(url, name):#
+def get_schedule(url, name):
     """
     Given a url it returns the raw html
@@ -50,7 +50,7 @@ def get_schedule(url, name):#
     """
     file_info = {
         "url": url,
-        "name": str(name),
+        "name": str(name) + "_schedule",
         "type": "html_schedule_nwhl",
         "season": "nwhl",
         'dir': shared.docs_dir
@@ -70,7 +70,7 @@ def get_season_codes():
     """
     They use fucked up codes instead of actual years to represent seasons in the url.
-    e.g. For 2015 - 'https://www.nwhl.zone/stats#/100/schedule?season_id=246
+    e.g. For 2019 - https://www.nwhl.zone/stats#/100/schedule?all&season_id=1950

     Instead of hardcoding it I just ping the base page and get the codes
@@ -96,7 +96,9 @@
 def parse_game(game, season):
     """
-    Given a soup object for a given game parse out the info
+    Given a soup object for a given game parse out the info.
+    Skip over all-star game
+
     :param games: Soup object
     :param season: nwhl season
@@ -107,6 +109,10 @@ def parse_game(game, season):
     # Team info
     teams = game.find_all("span", {"class": "team-inline"})
+
+    if "All-Star" in teams[0].text:
+        return parsed_game
+
     parsed_game['away_team'] = teams[0].find("span").text
     parsed_game['home_team'] = teams[1].find("span").text

@@ -155,10 +161,9 @@ def get_season_games(season, season_code):
     games = sched.find_all("tr", {"class": re.compile("^ng-scope")})

     for game in games:
-        # For 2015 below each game is an additionak <tr> with the attendance
-        # We filter out by checking if the following attribute exists
-        if game.get('ng-if') is None:
-            parsed_games.append(parse_game(game, season))
+        g = parse_game(game, season)
+        if g:
+            parsed_games.append(g)

     return parsed_games
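
The All-Star skip works through a sentinel: parse_game bails out with the still-empty parsed_game dict, and because an empty dict is falsy in Python, the new if g: check in get_season_games silently drops it. A self-contained sketch of the pattern, using made-up games in place of the soup objects:

def parse_game(game):
    parsed = {}
    if "All-Star" in game["teams"][0]:
        return parsed            # empty dict -> falsy -> caller skips it
    parsed["away"], parsed["home"] = game["teams"]
    return parsed

games = [
    {"teams": ["Boston Pride", "Minnesota Whitecaps"]},
    {"teams": ["All-Star Team A", "All-Star Team B"]},
]

schedule = [g for g in (parse_game(game) for game in games) if g]
print(schedule)  # only the regular game survives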
6 changes: 3 additions & 3 deletions hockey_scraper/utils/save_pages.py

@@ -17,9 +17,9 @@ def create_base_file_path(file_info):
     :return: path
     """
     # Shitty fix for when you already have it saved but don't have nwhl folders
-    # if 'nwhl' in file_info['type']:
-    #     if not os.path.isdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type'])):
-    #         os.mkdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type']))
+    if 'nwhl' in file_info['type']:
+        if not os.path.isdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type'])):
+            os.mkdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type']))

     return os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type'], file_info['name'] + ".txt")
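
Design note: the isdir/mkdir pair does the job but can race if two scrapers start at once, and os.mkdir fails when a parent directory is missing. A sketch of the usual one-liner alternative, assuming the same file_info layout as above:

import os

def ensure_type_dir(file_info):
    # exist_ok=True makes this idempotent; makedirs also creates missing parents
    path = os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type'])
    os.makedirs(path, exist_ok=True)
    return path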
5 changes: 3 additions & 2 deletions hockey_scraper/utils/shared.py

@@ -271,7 +271,7 @@ def add_dir(user_dir):
                     "deposited in or recheck the directory you typed in and start again.\n")


-def get_file(file_info):
+def get_file(file_info, force=False):
     """
     Get the specified file.
@@ -280,13 +280,14 @@ def get_file(file_info):
     :param file_info: Dictionary containing the info for the file.
                       Contains the url, name, type, and season
+    :param force: Force a rescrape. Default is False

     :return: page
     """
     file_info['dir'] = docs_dir

     # If everything checks out we'll retrieve it, otherwise we scrape it
-    if docs_dir and sp.check_file_exists(file_info) and not re_scrape:
+    if docs_dir and sp.check_file_exists(file_info) and not re_scrape and not force:
         page = sp.get_page(file_info)
     else:
         page = scrape_page(file_info['url'])
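
For callers, the new flag composes with the existing docs_dir and re_scrape machinery: the cached copy is used only when all four conditions hold, and force=True short-circuits that for schedule pages alone. A hedged usage sketch; only the keys url/name/type/season come from the docstring above, and the concrete values (including the statsapi URL) are illustrative:

import hockey_scraper.utils.shared as shared

# Schedule page: statuses may have flipped to Final since the last run -> always refetch
sched_info = {
    "url": "https://statsapi.web.nhl.com/api/v1/schedule?startDate=2021-01-20&endDate=2021-01-25",
    "name": "2021-01-20_2021-01-25",
    "type": "json_schedule",
    "season": 2020,
}
schedule_page = shared.get_file(sched_info, force=True)

# Any other page type keeps the default force=False and reuses the cached copy when present
# pbp_page = shared.get_file(pbp_info)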
